aboutsummaryrefslogtreecommitdiff
path: root/llvm/test
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test')
-rw-r--r-- llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll | 361
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll | 126
-rw-r--r-- llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll | 72
3 files changed, 208 insertions, 351 deletions
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
index d4cc154..52ca22b 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt_f.ll
@@ -1,38 +1,24 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,GENERIC
-; RUN: llc < %s -O0 -fast-isel -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,FAST
-; RUN: llc < %s -global-isel -global-isel-abort=2 -pass-remarks-missed=gisel* \
-; RUN: -mtriple=arm64-eabi -aarch64-neon-syntax=apple \
-; RUN: | FileCheck %s --check-prefixes=GISEL,FALLBACK
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple | FileCheck %s --check-prefixes=CHECK,CHECK-SD
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -O0 -fast-isel | FileCheck %s --check-prefixes=CHECK,CHECK-FI
+; RUN: llc < %s -mtriple=arm64-eabi -aarch64-neon-syntax=apple -global-isel -global-isel-abort=2 2>&1 | FileCheck %s --check-prefixes=CHECK,CHECK-GI
+
+; CHECK-GI: warning: Instruction selection used fallback path for test_vcvt_bf16_f64
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f64_f32)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f64_f32)
define <2 x double> @test_vcvt_f64_f32(<2 x float> %x) nounwind readnone ssp {
; CHECK-LABEL: test_vcvt_f64_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl v0.2d, v0.2s
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_f64_f32:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl v0.2d, v0.2s
-; GISEL-NEXT: ret
%vcvt1.i = fpext <2 x float> %x to <2 x double>
ret <2 x double> %vcvt1.i
}
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f64_f32)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f64_f32)
define <2 x double> @test_vcvt_high_f64_f32(<4 x float> %x) nounwind readnone ssp {
; CHECK-LABEL: test_vcvt_high_f64_f32:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.2d, v0.4s
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_f64_f32:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT: ret
%cvt_in = shufflevector <4 x float> %x, <4 x float> undef, <2 x i32> <i32 2, i32 3>
%vcvt1.i = fpext <2 x float> %cvt_in to <2 x double>
ret <2 x double> %vcvt1.i
@@ -43,11 +29,6 @@ define <2 x double> @test_vcvt_high_v1f64_f32_bitcast(<4 x float> %x) nounwind r
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.2d, v0.4s
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v1f64_f32_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT: ret
%bc1 = bitcast <4 x float> %x to <2 x double>
%ext = shufflevector <2 x double> %bc1, <2 x double> undef, <1 x i32> <i32 1>
%bc2 = bitcast <1 x double> %ext to <2 x float>
@@ -60,11 +41,6 @@ define <2 x double> @test_vcvt_high_v1i64_f32_bitcast(<2 x i64> %x) nounwind rea
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.2d, v0.4s
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v1i64_f32_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT: ret
%ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1>
%bc2 = bitcast <1 x i64> %ext to <2 x float>
%r = fpext <2 x float> %bc2 to <2 x double>
@@ -76,11 +52,6 @@ define <2 x double> @test_vcvt_high_v2i32_f32_bitcast(<4 x i32> %x) nounwind rea
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.2d, v0.4s
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v2i32_f32_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT: ret
%ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%bc2 = bitcast <2 x i32> %ext to <2 x float>
%r = fpext <2 x float> %bc2 to <2 x double>
@@ -92,11 +63,6 @@ define <2 x double> @test_vcvt_high_v4i16_f32_bitcast(<8 x i16> %x) nounwind rea
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.2d, v0.4s
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v4i16_f32_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT: ret
%ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bc2 = bitcast <4 x i16> %ext to <2 x float>
%r = fpext <2 x float> %bc2 to <2 x double>
@@ -108,11 +74,6 @@ define <2 x double> @test_vcvt_high_v8i8_f32_bitcast(<16 x i8> %x) nounwind read
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.2d, v0.4s
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v8i8_f32_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.2d, v0.4s
-; GISEL-NEXT: ret
%ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bc2 = bitcast <8 x i8> %ext to <2 x float>
%r = fpext <2 x float> %bc2 to <2 x double>
@@ -124,11 +85,6 @@ define <4 x float> @test_vcvt_high_v1i64_f16_bitcast(<2 x i64> %x) nounwind read
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v1i64_f16_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT: ret
%ext = shufflevector <2 x i64> %x, <2 x i64> undef, <1 x i32> <i32 1>
%bc2 = bitcast <1 x i64> %ext to <4 x half>
%r = fpext <4 x half> %bc2 to <4 x float>
@@ -140,11 +96,6 @@ define <4 x float> @test_vcvt_high_v2i32_f16_bitcast(<4 x i32> %x) nounwind read
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v2i32_f16_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT: ret
%ext = shufflevector <4 x i32> %x, <4 x i32> undef, <2 x i32> <i32 2, i32 3>
%bc2 = bitcast <2 x i32> %ext to <4 x half>
%r = fpext <4 x half> %bc2 to <4 x float>
@@ -156,11 +107,6 @@ define <4 x float> @test_vcvt_high_v4i16_f16_bitcast(<8 x i16> %x) nounwind read
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v4i16_f16_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT: ret
%ext = shufflevector <8 x i16> %x, <8 x i16> undef, <4 x i32> <i32 4, i32 5, i32 6, i32 7>
%bc2 = bitcast <4 x i16> %ext to <4 x half>
%r = fpext <4 x half> %bc2 to <4 x float>
@@ -172,134 +118,118 @@ define <4 x float> @test_vcvt_high_v8i8_f16_bitcast(<16 x i8> %x) nounwind readn
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtl2 v0.4s, v0.8h
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_high_v8i8_f16_bitcast:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtl2 v0.4s, v0.8h
-; GISEL-NEXT: ret
%ext = shufflevector <16 x i8> %x, <16 x i8> undef, <8 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
%bc2 = bitcast <8 x i8> %ext to <4 x half>
%r = fpext <4 x half> %bc2 to <4 x float>
ret <4 x float> %r
}
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_f32_f64)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_f32_f64)
define <2 x float> @test_vcvt_f32_f64(<2 x double> %v) nounwind readnone ssp {
; CHECK-LABEL: test_vcvt_f32_f64:
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtn v0.2s, v0.2d
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvt_f32_f64:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtn v0.2s, v0.2d
-; GISEL-NEXT: ret
%vcvt1.i = fptrunc <2 x double> %v to <2 x float>
ret <2 x float> %vcvt1.i
}
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_bf16_f64)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_bf16_f64)
define <2 x bfloat> @test_vcvt_bf16_f64(<2 x double> %v) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_bf16_f64:
-; GENERIC: // %bb.0:
-; GENERIC-NEXT: fcvtxn v0.2s, v0.2d
-; GENERIC-NEXT: movi.4s v1, #1
-; GENERIC-NEXT: movi.4s v2, #127, msl #8
-; GENERIC-NEXT: ushr.4s v3, v0, #16
-; GENERIC-NEXT: add.4s v2, v0, v2
-; GENERIC-NEXT: and.16b v1, v3, v1
-; GENERIC-NEXT: fcmeq.4s v3, v0, v0
-; GENERIC-NEXT: orr.4s v0, #64, lsl #16
-; GENERIC-NEXT: add.4s v1, v1, v2
-; GENERIC-NEXT: bit.16b v0, v1, v3
-; GENERIC-NEXT: shrn.4h v0, v0, #16
-; GENERIC-NEXT: ret
+; CHECK-SD-LABEL: test_vcvt_bf16_f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-SD-NEXT: movi.4s v1, #1
+; CHECK-SD-NEXT: movi.4s v2, #127, msl #8
+; CHECK-SD-NEXT: ushr.4s v3, v0, #16
+; CHECK-SD-NEXT: add.4s v2, v0, v2
+; CHECK-SD-NEXT: and.16b v1, v3, v1
+; CHECK-SD-NEXT: fcmeq.4s v3, v0, v0
+; CHECK-SD-NEXT: orr.4s v0, #64, lsl #16
+; CHECK-SD-NEXT: add.4s v1, v1, v2
+; CHECK-SD-NEXT: bit.16b v0, v1, v3
+; CHECK-SD-NEXT: shrn.4h v0, v0, #16
+; CHECK-SD-NEXT: ret
;
-; FAST-LABEL: test_vcvt_bf16_f64:
-; FAST: // %bb.0:
-; FAST-NEXT: fcvtxn v1.2s, v0.2d
-; FAST-NEXT: // implicit-def: $q0
-; FAST-NEXT: fmov d0, d1
-; FAST-NEXT: ushr.4s v1, v0, #16
-; FAST-NEXT: movi.4s v2, #1
-; FAST-NEXT: and.16b v1, v1, v2
-; FAST-NEXT: add.4s v1, v1, v0
-; FAST-NEXT: movi.4s v2, #127, msl #8
-; FAST-NEXT: add.4s v1, v1, v2
-; FAST-NEXT: mov.16b v2, v0
-; FAST-NEXT: orr.4s v2, #64, lsl #16
-; FAST-NEXT: fcmeq.4s v0, v0, v0
-; FAST-NEXT: bsl.16b v0, v1, v2
-; FAST-NEXT: shrn.4h v0, v0, #16
-; FAST-NEXT: ret
+; CHECK-FI-LABEL: test_vcvt_bf16_f64:
+; CHECK-FI: // %bb.0:
+; CHECK-FI-NEXT: fcvtxn v1.2s, v0.2d
+; CHECK-FI-NEXT: // implicit-def: $q0
+; CHECK-FI-NEXT: fmov d0, d1
+; CHECK-FI-NEXT: ushr.4s v1, v0, #16
+; CHECK-FI-NEXT: movi.4s v2, #1
+; CHECK-FI-NEXT: and.16b v1, v1, v2
+; CHECK-FI-NEXT: add.4s v1, v1, v0
+; CHECK-FI-NEXT: movi.4s v2, #127, msl #8
+; CHECK-FI-NEXT: add.4s v1, v1, v2
+; CHECK-FI-NEXT: mov.16b v2, v0
+; CHECK-FI-NEXT: orr.4s v2, #64, lsl #16
+; CHECK-FI-NEXT: fcmeq.4s v0, v0, v0
+; CHECK-FI-NEXT: bsl.16b v0, v1, v2
+; CHECK-FI-NEXT: shrn.4h v0, v0, #16
+; CHECK-FI-NEXT: ret
;
-; GISEL-LABEL: test_vcvt_bf16_f64:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtxn v0.2s, v0.2d
-; GISEL-NEXT: movi.4s v1, #1
-; GISEL-NEXT: movi.4s v2, #127, msl #8
-; GISEL-NEXT: ushr.4s v3, v0, #16
-; GISEL-NEXT: add.4s v2, v0, v2
-; GISEL-NEXT: and.16b v1, v3, v1
-; GISEL-NEXT: fcmeq.4s v3, v0, v0
-; GISEL-NEXT: orr.4s v0, #64, lsl #16
-; GISEL-NEXT: add.4s v1, v1, v2
-; GISEL-NEXT: bit.16b v0, v1, v3
-; GISEL-NEXT: shrn.4h v0, v0, #16
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvt_bf16_f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtxn v0.2s, v0.2d
+; CHECK-GI-NEXT: movi.4s v1, #1
+; CHECK-GI-NEXT: movi.4s v2, #127, msl #8
+; CHECK-GI-NEXT: ushr.4s v3, v0, #16
+; CHECK-GI-NEXT: add.4s v2, v0, v2
+; CHECK-GI-NEXT: and.16b v1, v3, v1
+; CHECK-GI-NEXT: fcmeq.4s v3, v0, v0
+; CHECK-GI-NEXT: orr.4s v0, #64, lsl #16
+; CHECK-GI-NEXT: add.4s v1, v1, v2
+; CHECK-GI-NEXT: bit.16b v0, v1, v3
+; CHECK-GI-NEXT: shrn.4h v0, v0, #16
+; CHECK-GI-NEXT: ret
%vcvt1.i = fptrunc <2 x double> %v to <2 x bfloat>
ret <2 x bfloat> %vcvt1.i
}
define half @test_vcvt_f16_f32(<1 x float> %x) {
-; GENERIC-LABEL: test_vcvt_f16_f32:
-; GENERIC: // %bb.0:
-; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0
-; GENERIC-NEXT: fcvt h0, s0
-; GENERIC-NEXT: ret
+; CHECK-SD-LABEL: test_vcvt_f16_f32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fcvt h0, s0
+; CHECK-SD-NEXT: ret
;
-; FAST-LABEL: test_vcvt_f16_f32:
-; FAST: // %bb.0:
-; FAST-NEXT: fmov d1, d0
-; FAST-NEXT: // implicit-def: $q0
-; FAST-NEXT: fmov d0, d1
-; FAST-NEXT: // kill: def $s0 killed $s0 killed $q0
-; FAST-NEXT: fcvt h0, s0
-; FAST-NEXT: ret
+; CHECK-FI-LABEL: test_vcvt_f16_f32:
+; CHECK-FI: // %bb.0:
+; CHECK-FI-NEXT: fmov d1, d0
+; CHECK-FI-NEXT: // implicit-def: $q0
+; CHECK-FI-NEXT: fmov d0, d1
+; CHECK-FI-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-FI-NEXT: fcvt h0, s0
+; CHECK-FI-NEXT: ret
;
-; GISEL-LABEL: test_vcvt_f16_f32:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvt h0, s0
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvt_f16_f32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ret
%tmp = fptrunc <1 x float> %x to <1 x half>
%elt = extractelement <1 x half> %tmp, i32 0
ret half %elt
}
-; FALLBACK-NOT: remark{{.*}}G_FPEXT{{.*}}(in function: test_vcvt_high_f32_f64)
-; FALLBACK-NOT: remark{{.*}}fpext{{.*}}(in function: test_vcvt_high_f32_f64)
define <4 x float> @test_vcvt_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvt_high_f32_f64:
-; GENERIC: // %bb.0:
-; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0
-; GENERIC-NEXT: fcvtn2 v0.4s, v1.2d
-; GENERIC-NEXT: ret
+; CHECK-SD-LABEL: test_vcvt_high_f32_f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-SD-NEXT: ret
;
-; FAST-LABEL: test_vcvt_high_f32_f64:
-; FAST: // %bb.0:
-; FAST-NEXT: fmov d2, d0
-; FAST-NEXT: // implicit-def: $q0
-; FAST-NEXT: fmov d0, d2
-; FAST-NEXT: fcvtn2 v0.4s, v1.2d
-; FAST-NEXT: ret
+; CHECK-FI-LABEL: test_vcvt_high_f32_f64:
+; CHECK-FI: // %bb.0:
+; CHECK-FI-NEXT: fmov d2, d0
+; CHECK-FI-NEXT: // implicit-def: $q0
+; CHECK-FI-NEXT: fmov d0, d2
+; CHECK-FI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-FI-NEXT: ret
;
-; GISEL-LABEL: test_vcvt_high_f32_f64:
-; GISEL: // %bb.0:
-; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: fcvtn2 v0.4s, v1.2d
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvt_high_f32_f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: fcvtn2 v0.4s, v1.2d
+; CHECK-GI-NEXT: ret
%cvt = fptrunc <2 x double> %v to <2 x float>
%vcvt2.i = shufflevector <2 x float> %x, <2 x float> %cvt, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %vcvt2.i
@@ -310,99 +240,80 @@ define <2 x float> @test_vcvtx_f32_f64(<2 x double> %v) nounwind readnone ssp {
; CHECK: // %bb.0:
; CHECK-NEXT: fcvtxn v0.2s, v0.2d
; CHECK-NEXT: ret
-;
-; GISEL-LABEL: test_vcvtx_f32_f64:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvtxn v0.2s, v0.2d
-; GISEL-NEXT: ret
%vcvtx1.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
ret <2 x float> %vcvtx1.i
}
define <4 x float> @test_vcvtx_high_f32_f64(<2 x float> %x, <2 x double> %v) nounwind readnone ssp {
-; GENERIC-LABEL: test_vcvtx_high_f32_f64:
-; GENERIC: // %bb.0:
-; GENERIC-NEXT: // kill: def $d0 killed $d0 def $q0
-; GENERIC-NEXT: fcvtxn2 v0.4s, v1.2d
-; GENERIC-NEXT: ret
+; CHECK-SD-LABEL: test_vcvtx_high_f32_f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-SD-NEXT: ret
;
-; FAST-LABEL: test_vcvtx_high_f32_f64:
-; FAST: // %bb.0:
-; FAST-NEXT: fmov d2, d0
-; FAST-NEXT: // implicit-def: $q0
-; FAST-NEXT: fmov d0, d2
-; FAST-NEXT: fcvtxn2 v0.4s, v1.2d
-; FAST-NEXT: ret
+; CHECK-FI-LABEL: test_vcvtx_high_f32_f64:
+; CHECK-FI: // %bb.0:
+; CHECK-FI-NEXT: fmov d2, d0
+; CHECK-FI-NEXT: // implicit-def: $q0
+; CHECK-FI-NEXT: fmov d0, d2
+; CHECK-FI-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-FI-NEXT: ret
;
-; GISEL-LABEL: test_vcvtx_high_f32_f64:
-; GISEL: // %bb.0:
-; GISEL-NEXT: // kill: def $d0 killed $d0 def $q0
-; GISEL-NEXT: fcvtxn2 v0.4s, v1.2d
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: test_vcvtx_high_f32_f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: fcvtxn2 v0.4s, v1.2d
+; CHECK-GI-NEXT: ret
%vcvtx2.i = tail call <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double> %v) nounwind
%res = shufflevector <2 x float> %x, <2 x float> %vcvtx2.i, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
ret <4 x float> %res
}
-
-declare <2 x double> @llvm.aarch64.neon.vcvthighfp2df(<4 x float>) nounwind readnone
-declare <2 x double> @llvm.aarch64.neon.vcvtfp2df(<2 x float>) nounwind readnone
-
-declare <2 x float> @llvm.aarch64.neon.vcvtdf2fp(<2 x double>) nounwind readnone
-declare <4 x float> @llvm.aarch64.neon.vcvthighdf2fp(<2 x float>, <2 x double>) nounwind readnone
-
-declare <2 x float> @llvm.aarch64.neon.fcvtxn.v2f32.v2f64(<2 x double>) nounwind readnone
-
define i16 @to_half(float %in) {
-; GENERIC-LABEL: to_half:
-; GENERIC: // %bb.0:
-; GENERIC-NEXT: fcvt h0, s0
-; GENERIC-NEXT: fmov w0, s0
-; GENERIC-NEXT: ret
+; CHECK-SD-LABEL: to_half:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvt h0, s0
+; CHECK-SD-NEXT: fmov w0, s0
+; CHECK-SD-NEXT: ret
;
-; FAST-LABEL: to_half:
-; FAST: // %bb.0:
-; FAST-NEXT: fcvt h1, s0
-; FAST-NEXT: // implicit-def: $w0
-; FAST-NEXT: fmov s0, w0
-; FAST-NEXT: fmov s0, s1
-; FAST-NEXT: fmov w0, s0
-; FAST-NEXT: // kill: def $w1 killed $w0
-; FAST-NEXT: ret
+; CHECK-FI-LABEL: to_half:
+; CHECK-FI: // %bb.0:
+; CHECK-FI-NEXT: fcvt h1, s0
+; CHECK-FI-NEXT: // implicit-def: $w0
+; CHECK-FI-NEXT: fmov s0, w0
+; CHECK-FI-NEXT: fmov s0, s1
+; CHECK-FI-NEXT: fmov w0, s0
+; CHECK-FI-NEXT: // kill: def $w1 killed $w0
+; CHECK-FI-NEXT: ret
;
-; GISEL-LABEL: to_half:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fcvt h0, s0
-; GISEL-NEXT: fmov w0, s0
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: to_half:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: fmov w0, s0
+; CHECK-GI-NEXT: ret
%res = call i16 @llvm.convert.to.fp16.f32(float %in)
ret i16 %res
}
define float @from_half(i16 %in) {
-; GENERIC-LABEL: from_half:
-; GENERIC: // %bb.0:
-; GENERIC-NEXT: fmov s0, w0
-; GENERIC-NEXT: fcvt s0, h0
-; GENERIC-NEXT: ret
+; CHECK-SD-LABEL: from_half:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fmov s0, w0
+; CHECK-SD-NEXT: fcvt s0, h0
+; CHECK-SD-NEXT: ret
;
-; FAST-LABEL: from_half:
-; FAST: // %bb.0:
-; FAST-NEXT: fmov s0, w0
-; FAST-NEXT: // kill: def $h0 killed $h0 killed $s0
-; FAST-NEXT: fcvt s0, h0
-; FAST-NEXT: ret
+; CHECK-FI-LABEL: from_half:
+; CHECK-FI: // %bb.0:
+; CHECK-FI-NEXT: fmov s0, w0
+; CHECK-FI-NEXT: // kill: def $h0 killed $h0 killed $s0
+; CHECK-FI-NEXT: fcvt s0, h0
+; CHECK-FI-NEXT: ret
;
-; GISEL-LABEL: from_half:
-; GISEL: // %bb.0:
-; GISEL-NEXT: fmov s0, w0
-; GISEL-NEXT: fcvt s0, h0
-; GISEL-NEXT: ret
+; CHECK-GI-LABEL: from_half:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fmov s0, w0
+; CHECK-GI-NEXT: fcvt s0, h0
+; CHECK-GI-NEXT: ret
%res = call float @llvm.convert.from.fp16.f32(i16 %in)
ret float %res
}
-
-declare float @llvm.convert.from.fp16.f32(i16) #1
-declare i16 @llvm.convert.to.fp16.f32(float) #1
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; FALLBACK: {{.*}}
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
index 5ae0839..3dfa6df 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-chained.ll
@@ -1361,132 +1361,6 @@ for.body: ; preds = %for.body.preheader,
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body, !loop !1
}
-define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
-; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-NEON-NEXT: entry:
-; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
-; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-NEON: vector.ph:
-; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
-; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEON-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-NEON: vector.body:
-; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP4:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
-; CHECK-NEON-NEXT: [[PARTIAL_REDUCE:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
-; CHECK-NEON-NEXT: [[TMP4]] = add <16 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
-; CHECK-NEON-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-NEON-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-NEON: middle.block:
-; CHECK-NEON-NEXT: [[TMP6:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP4]])
-; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-NEON-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-NEON: scalar.ph:
-;
-; CHECK-SVE-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-NEXT: entry:
-; CHECK-SVE-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 2
-; CHECK-SVE-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE: vector.ph:
-; CHECK-SVE-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 4
-; CHECK-SVE-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 4 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 4 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 4 x i32> poison, <vscale x 4 x i32> zeroinitializer
-; CHECK-SVE-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE: vector.body:
-; CHECK-SVE-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 4 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP9:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 4 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-NEXT: [[TMP7:%.*]] = zext <vscale x 4 x i8> [[WIDE_LOAD]] to <vscale x 4 x i32>
-; CHECK-SVE-NEXT: [[TMP8:%.*]] = add <vscale x 4 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-NEXT: [[TMP9]] = add <vscale x 4 x i32> [[TMP8]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE: middle.block:
-; CHECK-SVE-NEXT: [[TMP11:%.*]] = call i32 @llvm.vector.reduce.add.nxv4i32(<vscale x 4 x i32> [[TMP9]])
-; CHECK-SVE-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE: scalar.ph:
-;
-; CHECK-SVE-MAXBW-LABEL: define i32 @red_extended_add_incomplete_chain(
-; CHECK-SVE-MAXBW-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR1:[0-9]+]] {
-; CHECK-SVE-MAXBW-NEXT: entry:
-; CHECK-SVE-MAXBW-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
-; CHECK-SVE-MAXBW-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
-; CHECK-SVE-MAXBW-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP3:%.*]] = shl nuw i64 [[TMP2]], 3
-; CHECK-SVE-MAXBW-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
-; CHECK-SVE-MAXBW: vector.ph:
-; CHECK-SVE-MAXBW-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64()
-; CHECK-SVE-MAXBW-NEXT: [[TMP5:%.*]] = mul nuw i64 [[TMP4]], 8
-; CHECK-SVE-MAXBW-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 8 x i32> poison, i32 [[OFFSET]], i64 0
-; CHECK-SVE-MAXBW-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 8 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 8 x i32> poison, <vscale x 8 x i32> zeroinitializer
-; CHECK-SVE-MAXBW-NEXT: br label [[VECTOR_BODY:%.*]]
-; CHECK-SVE-MAXBW: vector.body:
-; CHECK-SVE-MAXBW-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[VEC_PHI:%.*]] = phi <vscale x 8 x i32> [ zeroinitializer, [[VECTOR_PH]] ], [ [[TMP8:%.*]], [[VECTOR_BODY]] ]
-; CHECK-SVE-MAXBW-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
-; CHECK-SVE-MAXBW-NEXT: [[WIDE_LOAD:%.*]] = load <vscale x 8 x i8>, ptr [[NEXT_GEP]], align 1
-; CHECK-SVE-MAXBW-NEXT: [[TMP7:%.*]] = zext <vscale x 8 x i8> [[WIDE_LOAD]] to <vscale x 8 x i32>
-; CHECK-SVE-MAXBW-NEXT: [[PARTIAL_REDUCE:%.*]] = add <vscale x 8 x i32> [[VEC_PHI]], [[TMP7]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP8]] = add <vscale x 8 x i32> [[PARTIAL_REDUCE]], [[BROADCAST_SPLAT]]
-; CHECK-SVE-MAXBW-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]]
-; CHECK-SVE-MAXBW-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]]
-; CHECK-SVE-MAXBW: middle.block:
-; CHECK-SVE-MAXBW-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.nxv8i32(<vscale x 8 x i32> [[TMP8]])
-; CHECK-SVE-MAXBW-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
-; CHECK-SVE-MAXBW-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]]
-; CHECK-SVE-MAXBW: scalar.ph:
-;
-entry:
- br label %loop
-
-loop:
- %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
- %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
- %l = load i8, ptr %ptr.iv, align 1
- %l.ext = zext i8 %l to i32
- %add = add i32 %red, %l.ext
- %red.next = add i32 %add, %offset
- %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
- %ec = icmp eq ptr %ptr.iv, %end
- br i1 %ec, label %exit, label %loop
-
-exit:
- ret i32 %red.next
-}
-
attributes #0 = { vscale_range(1,16) }
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
new file mode 100644
index 0000000..d80178fd
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/partial-reduce-incomplete-chains.ll
@@ -0,0 +1,72 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals none --version 6
+; RUN: opt --mattr=+neon,+dotprod -passes=loop-vectorize -force-vector-interleave=1 -enable-epilogue-vectorization=false -S %s | FileCheck %s --check-prefixes=CHECK-NEON
+
+target triple = "arm64-apple-macosx"
+
+define i32 @red_extended_add_incomplete_chain(ptr %start, ptr %end, i32 %offset) {
+; CHECK-NEON-LABEL: define i32 @red_extended_add_incomplete_chain(
+; CHECK-NEON-SAME: ptr [[START:%.*]], ptr [[END:%.*]], i32 [[OFFSET:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEON-NEXT: [[ENTRY:.*]]:
+; CHECK-NEON-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64
+; CHECK-NEON-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64
+; CHECK-NEON-NEXT: [[TMP0:%.*]] = add i64 [[END1]], 1
+; CHECK-NEON-NEXT: [[TMP1:%.*]] = sub i64 [[TMP0]], [[START2]]
+; CHECK-NEON-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP1]], 16
+; CHECK-NEON-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]]
+; CHECK-NEON: [[VECTOR_PH]]:
+; CHECK-NEON-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP1]], 16
+; CHECK-NEON-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP1]], [[N_MOD_VF]]
+; CHECK-NEON-NEXT: [[TMP2:%.*]] = getelementptr i8, ptr [[START]], i64 [[N_VEC]]
+; CHECK-NEON-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i32> poison, i32 [[OFFSET]], i64 0
+; CHECK-NEON-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i32> [[BROADCAST_SPLATINSERT]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEON-NEXT: br label %[[VECTOR_BODY:.*]]
+; CHECK-NEON: [[VECTOR_BODY]]:
+; CHECK-NEON-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[VEC_PHI:%.*]] = phi <16 x i32> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ]
+; CHECK-NEON-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[INDEX]]
+; CHECK-NEON-NEXT: [[WIDE_LOAD:%.*]] = load <16 x i8>, ptr [[NEXT_GEP]], align 1
+; CHECK-NEON-NEXT: [[TMP3:%.*]] = zext <16 x i8> [[WIDE_LOAD]] to <16 x i32>
+; CHECK-NEON-NEXT: [[TMP4:%.*]] = add <16 x i32> [[VEC_PHI]], [[TMP3]]
+; CHECK-NEON-NEXT: [[TMP5]] = add <16 x i32> [[TMP4]], [[BROADCAST_SPLAT]]
+; CHECK-NEON-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16
+; CHECK-NEON-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-NEON: [[MIDDLE_BLOCK]]:
+; CHECK-NEON-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v16i32(<16 x i32> [[TMP5]])
+; CHECK-NEON-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]]
+; CHECK-NEON-NEXT: br i1 [[CMP_N]], label %[[EXIT:.*]], label %[[SCALAR_PH]]
+; CHECK-NEON: [[SCALAR_PH]]:
+; CHECK-NEON-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[TMP2]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ]
+; CHECK-NEON-NEXT: [[BC_MERGE_RDX:%.*]] = phi i32 [ [[TMP7]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ]
+; CHECK-NEON-NEXT: br label %[[LOOP:.*]]
+; CHECK-NEON: [[LOOP]]:
+; CHECK-NEON-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[GEP_IV_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEON-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ]
+; CHECK-NEON-NEXT: [[L:%.*]] = load i8, ptr [[PTR_IV]], align 1
+; CHECK-NEON-NEXT: [[L_EXT:%.*]] = zext i8 [[L]] to i32
+; CHECK-NEON-NEXT: [[ADD:%.*]] = add i32 [[RED]], [[L_EXT]]
+; CHECK-NEON-NEXT: [[RED_NEXT]] = add i32 [[ADD]], [[OFFSET]]
+; CHECK-NEON-NEXT: [[GEP_IV_NEXT]] = getelementptr i8, ptr [[PTR_IV]], i64 1
+; CHECK-NEON-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]]
+; CHECK-NEON-NEXT: br i1 [[EC]], label %[[EXIT]], label %[[LOOP]], !llvm.loop [[LOOP3:![0-9]+]]
+; CHECK-NEON: [[EXIT]]:
+; CHECK-NEON-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i32 [ [[RED_NEXT]], %[[LOOP]] ], [ [[TMP7]], %[[MIDDLE_BLOCK]] ]
+; CHECK-NEON-NEXT: ret i32 [[RED_NEXT_LCSSA]]
+;
+entry:
+ br label %loop
+
+loop:
+ %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ]
+ %red = phi i32 [ 0, %entry ], [ %red.next, %loop ]
+ %l = load i8, ptr %ptr.iv, align 1
+ %l.ext = zext i8 %l to i32
+ %add = add i32 %red, %l.ext
+ %red.next = add i32 %add, %offset
+ %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1
+ %ec = icmp eq ptr %ptr.iv, %end
+ br i1 %ec, label %exit, label %loop
+
+exit:
+ ret i32 %red.next
+}