Diffstat (limited to 'llvm/test/CodeGen/AArch64')
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll                                 |    6
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir                           |   30
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir |   19
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir     |   20
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir               |  102
-rw-r--r--  llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir                                |    2
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll                                    |  489
-rw-r--r--  llvm/test/CodeGen/AArch64/aarch64-smull.ll                                                 |   14
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll                                          | 1943
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-neon-copy.ll                                               |   57
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64-vcvt.ll                                                    |   30
-rw-r--r--  llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll                                          |  178
-rw-r--r--  llvm/test/CodeGen/AArch64/dup.ll                                                           |   12
-rw-r--r--  llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll                                             |   21
-rw-r--r--  llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll                                             |   21
-rw-r--r--  llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll                                             |   73
-rw-r--r--  llvm/test/CodeGen/AArch64/load-zext-bitcast.ll                                             |   80
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll                                          |  109
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-za-exceptions.ll                                             |    2
19 files changed, 2893 insertions, 315 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
index 0f75887..c68ae92 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/arm64-irtranslator.ll
@@ -1734,8 +1734,7 @@ define float @test_different_call_conv_target(float %x) {
define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
; CHECK-LABEL: name: test_shufflevector_s32_v2s32
; CHECK: [[ARG:%[0-9]+]]:_(s32) = COPY $w0
-; CHECK-DAG: [[UNDEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
-; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_SHUFFLE_VECTOR [[ARG]](s32), [[UNDEF]], shufflemask(0, 0)
+; CHECK: [[VEC:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[ARG]](s32), [[ARG]](s32)
; CHECK: $d0 = COPY [[VEC]](<2 x s32>)
%vec = insertelement <1 x i32> undef, i32 %arg, i32 0
%res = shufflevector <1 x i32> %vec, <1 x i32> undef, <2 x i32> zeroinitializer
@@ -1745,7 +1744,8 @@ define <2 x i32> @test_shufflevector_s32_v2s32(i32 %arg) {
define i32 @test_shufflevector_v2s32_s32(<2 x i32> %arg) {
; CHECK-LABEL: name: test_shufflevector_v2s32_s32
; CHECK: [[ARG:%[0-9]+]]:_(<2 x s32>) = COPY $d0
-; CHECK: [[RES:%[0-9]+]]:_(s32) = G_SHUFFLE_VECTOR [[ARG]](<2 x s32>), [[UNDEF]], shufflemask(1)
+; CHECK: [[IDX:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
+; CHECK: [[RES:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[ARG]](<2 x s32>), [[IDX]](s64)
; CHECK: $w0 = COPY [[RES]](s32)
%vec = shufflevector <2 x i32> %arg, <2 x i32> undef, <1 x i32> <i32 1>
%res = extractelement <1 x i32> %vec, i32 0
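
Both hunks above reflect the IRTranslator change under test: a shufflevector whose sources or result are effectively scalar (<1 x ...> vectors) is now translated directly to G_BUILD_VECTOR for the splat case and G_EXTRACT_VECTOR_ELT for the single-lane case, instead of a G_SHUFFLE_VECTOR on scalar registers. A minimal gMIR sketch of the two shapes, lifted from the CHECK lines above (illustrative only, not part of the patch):

  ; splat of a scalar: was G_SHUFFLE_VECTOR %arg(s32), %undef, shufflemask(0, 0)
  %vec:_(<2 x s32>) = G_BUILD_VECTOR %arg(s32), %arg(s32)
  ; single-lane extract: was G_SHUFFLE_VECTOR %v(<2 x s32>), %undef, shufflemask(1)
  %idx:_(s64) = G_CONSTANT i64 1
  %res:_(s32) = G_EXTRACT_VECTOR_ELT %v(<2 x s32>), %idx(s64)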
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
index 2e70252..fdd0ebb 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-shuffle-vector.mir
@@ -119,33 +119,6 @@ body: |
...
---
-name: shuffle_1elt_mask
-alignment: 4
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $d0, $d1
-
- ; CHECK-LABEL: name: shuffle_1elt_mask
- ; CHECK: liveins: $d0, $d1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $d0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d1
- ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[COPY]](s64)
- ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[COPY1]](s64)
- ; CHECK-NEXT: $d0 = COPY [[COPY2]](s64)
- ; CHECK-NEXT: $d1 = COPY [[COPY3]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1
- %0:_(s64) = COPY $d0
- %1:_(s64) = COPY $d1
- %3:_(s64) = G_SHUFFLE_VECTOR %0:_(s64), %1:_, shufflemask(0)
- %4:_(s64) = G_SHUFFLE_VECTOR %0:_(s64), %1:_, shufflemask(1)
- $d0 = COPY %3(s64)
- $d1 = COPY %4(s64)
- RET_ReallyLR implicit $d0, implicit $d1
-
-...
----
name: oversize_shuffle_v4i64
alignment: 4
tracksRegLiveness: true
@@ -641,7 +614,8 @@ body: |
%0:_(<8 x s1>) = G_TRUNC %2:_(<8 x s8>)
%3:_(<8 x s8>) = COPY $d1
%1:_(<8 x s1>) = G_TRUNC %3:_(<8 x s8>)
- %4:_(s1) = G_SHUFFLE_VECTOR %0:_(<8 x s1>), %1:_, shufflemask(12)
+ %7:_(s64) = G_CONSTANT i64 4
+ %4:_(s1) = G_EXTRACT_VECTOR_ELT %1:_(<8 x s1>), %7(s64)
%5:_(s8) = G_ZEXT %4:_(s1)
%6:_(s32) = G_ANYEXT %5:_(s8)
$w0 = COPY %6:_(s32)
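
The rewritten input above encodes the shuffle semantics by hand: with two <8 x s1> sources, mask index 12 addresses the 16-lane concatenation of the operands, i.e. lane 12 - 8 = 4 of the second source, which is exactly what the new G_CONSTANT/G_EXTRACT_VECTOR_ELT pair selects:

  ; shufflemask(12) over %0:<8 x s1> ++ %1:<8 x s1>  ==  lane 4 of %1
  %7:_(s64) = G_CONSTANT i64 4
  %4:_(s1) = G_EXTRACT_VECTOR_ELT %1:_(<8 x s1>), %7(s64)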
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
index 9261d7a..914dfa0 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-disjoint-mask.mir
@@ -80,22 +80,3 @@ body: |
%2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(-1,0,1,-1)
RET_ReallyLR implicit %2
...
-
----
-name: shuffle_vector_scalar
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_scalar
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[COPY]](s64), [[COPY]](s64), [[COPY]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = COPY $x1
- %2:_(<4 x s64>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0, 0, 0, 0)
- RET_ReallyLR implicit %2
-...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
index 9bf7993..7a314ba 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector-undef-rhs.mir
@@ -20,23 +20,3 @@ body: |
%2:_(<4 x s32>) = G_SHUFFLE_VECTOR %0(<2 x s32>), %1(<2 x s32>), shufflemask(0, 2, 1, 3)
RET_ReallyLR implicit %2
...
-
----
-name: shuffle_vector_undef_rhs_scalar
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0
-
- ; CHECK-LABEL: name: shuffle_vector_undef_rhs_scalar
- ; CHECK: liveins: $x0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[DEF]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<2 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = G_IMPLICIT_DEF
- %2:_(<2 x s64>) = G_SHUFFLE_VECTOR %0(s64), %1(s64), shufflemask(0, 1)
- RET_ReallyLR implicit %2
-...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
index 2c9ae5b..9013410 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/prelegalizercombiner-shuffle-vector.mir
@@ -386,7 +386,7 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x p0>)
%0:_(p0) = COPY $x0
%1:_(p0) = COPY $x1
- %6:_(<4 x p0>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,1,0,1)
+ %6:_(<4 x p0>) = G_BUILD_VECTOR %0, %1, %0, %1
RET_ReallyLR implicit %6
...
@@ -408,104 +408,6 @@ body: |
; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<2 x p0>)
%0:_(p0) = COPY $x0
%1:_(p0) = COPY $x1
- %6:_(<2 x p0>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1,0)
- RET_ReallyLR implicit %6
-...
-
-# Check that we properly use undef values when shuffle_vector
-# on scalars gets lowered to build_vector.
----
-name: shuffle_vector_on_scalars_to_build_vector_with_undef
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_on_scalars_to_build_vector_with_undef
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s64) = COPY $x0
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $x1
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s64) = G_IMPLICIT_DEF
- ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s64>) = G_BUILD_VECTOR [[COPY]](s64), [[DEF]](s64), [[DEF]](s64), [[COPY1]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[BUILD_VECTOR]](<4 x s64>)
- %0:_(s64) = COPY $x0
- %1:_(s64) = COPY $x1
- %6:_(<4 x s64>) = G_SHUFFLE_VECTOR %0, %1, shufflemask(0,-1,-1,1)
- RET_ReallyLR implicit %6
-...
-
-# Check that shuffle_vector on scalars gets combined into a plain
-# copy when the resulting type is a scalar as well and the sizes
-# are compatible.
----
-name: shuffle_vector_on_scalars_to_copy_ptr
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0
-
- ; CHECK-LABEL: name: shuffle_vector_on_scalars_to_copy_ptr
- ; CHECK: liveins: $x0
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(p0) = COPY $x0
- ; CHECK-NEXT: RET_ReallyLR implicit [[COPY]](p0)
- %0:_(p0) = COPY $x0
- %6:_(p0) = G_SHUFFLE_VECTOR %0, %0, shufflemask(0)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_lhs
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_lhs
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x0
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 1
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[EVEC]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(1)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_rhs
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_rhs
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[COPY:%[0-9]+]]:_(<2 x s32>) = COPY $x1
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s64) = G_CONSTANT i64 0
- ; CHECK-NEXT: [[EVEC:%[0-9]+]]:_(s32) = G_EXTRACT_VECTOR_ELT [[COPY]](<2 x s32>), [[C]](s64)
- ; CHECK-NEXT: RET_ReallyLR implicit [[EVEC]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(2)
- RET_ReallyLR implicit %6
-...
----
-name: shuffle_vector_to_copy_undef
-tracksRegLiveness: true
-body: |
- bb.1:
- liveins: $x0, $x1
-
- ; CHECK-LABEL: name: shuffle_vector_to_copy_undef
- ; CHECK: liveins: $x0, $x1
- ; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
- ; CHECK-NEXT: RET_ReallyLR implicit [[DEF]](s32)
- %0:_(<2 x s32>) = COPY $x0
- %1:_(<2 x s32>) = COPY $x1
- %6:_(s32) = G_SHUFFLE_VECTOR %0, %1, shufflemask(-1)
+ %6:_(<2 x p0>) = G_BUILD_VECTOR %1, %0
RET_ReallyLR implicit %6
...
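
Every test deleted from this file exercised G_SHUFFLE_VECTOR with scalar operands or a scalar result; with the translator no longer emitting that form, the combines that folded it into G_BUILD_VECTOR, G_EXTRACT_VECTOR_ELT, or plain copies presumably have nothing left to match, and the surviving vector tests were rewritten to build their inputs with G_BUILD_VECTOR directly. As one worked example, the removed shuffle_vector_to_copy_rhs case used mask index 2 over two <2 x s32> sources, i.e. lane 2 - 2 = 0 of the RHS, matching its old CHECK lines (illustrative only):

  %c:_(s64) = G_CONSTANT i64 0
  %elt:_(s32) = G_EXTRACT_VECTOR_ELT %1:_(<2 x s32>), %c(s64)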
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
index b252884..46dbc15 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/regbank-fp-use-def.mir
@@ -96,7 +96,7 @@ body: |
; CHECK-NEXT: [[SITOFP:%[0-9]+]]:fpr(s32) = G_SITOFP [[COPY1]](s32)
; CHECK-NEXT: [[COPY3:%[0-9]+]]:fpr(s32) = COPY [[COPY2]](s32)
; CHECK-NEXT: [[SELECT:%[0-9]+]]:fpr(s32) = G_SELECT [[COPY2]](s32), [[COPY3]], [[SITOFP]]
- ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:gpr(s32) = G_FPTOSI [[SELECT]](s32)
+ ; CHECK-NEXT: [[FPTOSI:%[0-9]+]]:fpr(s32) = G_FPTOSI [[SELECT]](s32)
%0:_(s32) = COPY $w0
%2:_(s32) = COPY $w1
%3:_(s32) = COPY $w2
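
The one-token change above moves the expected register bank of the G_FPTOSI result from gpr to fpr. Its input is the fpr-assigned G_SELECT, so the updated expectation keeps the conversion on the FP bank rather than forcing a cross-bank copy; a sketch of the checked pattern (same s32 types as the test, illustrative):

  %sel:fpr(s32) = G_SELECT %cond(s32), %lhs, %rhs
  %cvt:fpr(s32) = G_FPTOSI %sel(s32)   ; previously expected on gpr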
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 0933e67..b54f262 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -749,12 +749,429 @@ for.body: ; preds = %for.body.preheader1
br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
}
+define i64 @red_mla_dup_ext_u8_s8_s64(ptr noalias noundef readonly captures(none) %A, i8 noundef %B, i32 noundef %n) {
+; CHECK-SD-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-SD: // %bb.0: // %entry
+; CHECK-SD-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-SD-NEXT: cbz w2, .LBB6_3
+; CHECK-SD-NEXT: // %bb.1: // %iter.check
+; CHECK-SD-NEXT: str x25, [sp, #-64]! // 8-byte Folded Spill
+; CHECK-SD-NEXT: stp x24, x23, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x22, x21, [sp, #32] // 16-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NEXT: .cfi_offset w21, -24
+; CHECK-SD-NEXT: .cfi_offset w22, -32
+; CHECK-SD-NEXT: .cfi_offset w23, -40
+; CHECK-SD-NEXT: .cfi_offset w24, -48
+; CHECK-SD-NEXT: .cfi_offset w25, -64
+; CHECK-SD-NEXT: sxtb x9, w1
+; CHECK-SD-NEXT: cmp w2, #3
+; CHECK-SD-NEXT: mov w10, w2
+; CHECK-SD-NEXT: b.hi .LBB6_4
+; CHECK-SD-NEXT: // %bb.2:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_13
+; CHECK-SD-NEXT: .LBB6_3:
+; CHECK-SD-NEXT: mov x0, xzr
+; CHECK-SD-NEXT: ret
+; CHECK-SD-NEXT: .LBB6_4: // %vector.main.loop.iter.check
+; CHECK-SD-NEXT: dup v0.2d, x9
+; CHECK-SD-NEXT: cmp w2, #16
+; CHECK-SD-NEXT: b.hs .LBB6_6
+; CHECK-SD-NEXT: // %bb.5:
+; CHECK-SD-NEXT: mov x11, xzr
+; CHECK-SD-NEXT: mov x8, xzr
+; CHECK-SD-NEXT: b .LBB6_10
+; CHECK-SD-NEXT: .LBB6_6: // %vector.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: and x12, x10, #0xc
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v4.2d, #0000000000000000
+; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v3.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v7.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x15, x0
+; CHECK-SD-NEXT: movi v5.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v16.2d, #0000000000000000
+; CHECK-SD-NEXT: and x16, x10, #0xfffffff0
+; CHECK-SD-NEXT: movi v6.2d, #0000000000000000
+; CHECK-SD-NEXT: fmov x13, d0
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: .LBB6_7: // %vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr q17, [x15], #16
+; CHECK-SD-NEXT: subs x16, x16, #16
+; CHECK-SD-NEXT: ushll v18.8h, v17.8b, #0
+; CHECK-SD-NEXT: ushll2 v19.8h, v17.16b, #0
+; CHECK-SD-NEXT: ushll v17.4s, v18.4h, #0
+; CHECK-SD-NEXT: ushll2 v20.4s, v19.8h, #0
+; CHECK-SD-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-SD-NEXT: ushll v19.4s, v19.4h, #0
+; CHECK-SD-NEXT: ushll v21.2d, v17.2s, #0
+; CHECK-SD-NEXT: ushll2 v22.2d, v20.4s, #0
+; CHECK-SD-NEXT: ushll2 v17.2d, v17.4s, #0
+; CHECK-SD-NEXT: ushll v23.2d, v18.2s, #0
+; CHECK-SD-NEXT: ushll v20.2d, v20.2s, #0
+; CHECK-SD-NEXT: ushll2 v18.2d, v18.4s, #0
+; CHECK-SD-NEXT: fmov x17, d21
+; CHECK-SD-NEXT: mov x2, v21.d[1]
+; CHECK-SD-NEXT: ushll v21.2d, v19.2s, #0
+; CHECK-SD-NEXT: ushll2 v19.2d, v19.4s, #0
+; CHECK-SD-NEXT: fmov x18, d22
+; CHECK-SD-NEXT: fmov x1, d17
+; CHECK-SD-NEXT: fmov x3, d23
+; CHECK-SD-NEXT: fmov x21, d20
+; CHECK-SD-NEXT: fmov x22, d18
+; CHECK-SD-NEXT: fmov x19, d21
+; CHECK-SD-NEXT: mul x17, x13, x17
+; CHECK-SD-NEXT: mov x4, v22.d[1]
+; CHECK-SD-NEXT: fmov x24, d19
+; CHECK-SD-NEXT: mov x5, v23.d[1]
+; CHECK-SD-NEXT: mov x6, v21.d[1]
+; CHECK-SD-NEXT: mov x7, v20.d[1]
+; CHECK-SD-NEXT: mov x20, v18.d[1]
+; CHECK-SD-NEXT: mov x23, v19.d[1]
+; CHECK-SD-NEXT: mov x25, v17.d[1]
+; CHECK-SD-NEXT: mul x18, x14, x18
+; CHECK-SD-NEXT: mul x1, x13, x1
+; CHECK-SD-NEXT: fmov d17, x17
+; CHECK-SD-NEXT: mul x3, x13, x3
+; CHECK-SD-NEXT: fmov d18, x18
+; CHECK-SD-NEXT: mul x19, x13, x19
+; CHECK-SD-NEXT: fmov d19, x1
+; CHECK-SD-NEXT: mul x21, x13, x21
+; CHECK-SD-NEXT: fmov d20, x3
+; CHECK-SD-NEXT: mul x22, x13, x22
+; CHECK-SD-NEXT: fmov d21, x19
+; CHECK-SD-NEXT: mul x24, x13, x24
+; CHECK-SD-NEXT: fmov d24, x21
+; CHECK-SD-NEXT: mul x2, x8, x2
+; CHECK-SD-NEXT: fmov d22, x22
+; CHECK-SD-NEXT: mul x4, x8, x4
+; CHECK-SD-NEXT: fmov d23, x24
+; CHECK-SD-NEXT: mul x5, x8, x5
+; CHECK-SD-NEXT: mov v17.d[1], x2
+; CHECK-SD-NEXT: mul x6, x8, x6
+; CHECK-SD-NEXT: mov v18.d[1], x4
+; CHECK-SD-NEXT: mul x7, x8, x7
+; CHECK-SD-NEXT: mov v20.d[1], x5
+; CHECK-SD-NEXT: add v1.2d, v17.2d, v1.2d
+; CHECK-SD-NEXT: mul x20, x8, x20
+; CHECK-SD-NEXT: mov v21.d[1], x6
+; CHECK-SD-NEXT: add v6.2d, v18.2d, v6.2d
+; CHECK-SD-NEXT: mul x23, x8, x23
+; CHECK-SD-NEXT: mov v24.d[1], x7
+; CHECK-SD-NEXT: add v4.2d, v20.2d, v4.2d
+; CHECK-SD-NEXT: mul x17, x8, x25
+; CHECK-SD-NEXT: mov v22.d[1], x20
+; CHECK-SD-NEXT: add v7.2d, v21.2d, v7.2d
+; CHECK-SD-NEXT: mov v23.d[1], x23
+; CHECK-SD-NEXT: add v16.2d, v24.2d, v16.2d
+; CHECK-SD-NEXT: mov v19.d[1], x17
+; CHECK-SD-NEXT: add v3.2d, v22.2d, v3.2d
+; CHECK-SD-NEXT: add v5.2d, v23.2d, v5.2d
+; CHECK-SD-NEXT: add v2.2d, v19.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_7
+; CHECK-SD-NEXT: // %bb.8: // %middle.block
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v7.2d
+; CHECK-SD-NEXT: add v4.2d, v4.2d, v16.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v5.2d
+; CHECK-SD-NEXT: add v3.2d, v3.2d, v6.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v4.2d
+; CHECK-SD-NEXT: add v2.2d, v2.2d, v3.2d
+; CHECK-SD-NEXT: add v1.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: addp d1, v1.2d
+; CHECK-SD-NEXT: fmov x8, d1
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: // %bb.9: // %vec.epilog.iter.check
+; CHECK-SD-NEXT: cbz x12, .LBB6_13
+; CHECK-SD-NEXT: .LBB6_10: // %vec.epilog.ph
+; CHECK-SD-NEXT: movi v1.2d, #0000000000000000
+; CHECK-SD-NEXT: movi v2.2d, #0000000000000000
+; CHECK-SD-NEXT: mov x13, x11
+; CHECK-SD-NEXT: movi v3.2d, #0x000000000000ff
+; CHECK-SD-NEXT: fmov x14, d0
+; CHECK-SD-NEXT: and x11, x10, #0xfffffffc
+; CHECK-SD-NEXT: fmov x15, d0
+; CHECK-SD-NEXT: sub x12, x13, x11
+; CHECK-SD-NEXT: add x13, x0, x13
+; CHECK-SD-NEXT: mov v1.d[0], x8
+; CHECK-SD-NEXT: mov x8, v0.d[1]
+; CHECK-SD-NEXT: .LBB6_11: // %vec.epilog.vector.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldr s0, [x13], #4
+; CHECK-SD-NEXT: adds x12, x12, #4
+; CHECK-SD-NEXT: ushll v0.8h, v0.8b, #0
+; CHECK-SD-NEXT: ushll v0.4s, v0.4h, #0
+; CHECK-SD-NEXT: ushll v4.2d, v0.2s, #0
+; CHECK-SD-NEXT: ushll2 v0.2d, v0.4s, #0
+; CHECK-SD-NEXT: and v4.16b, v4.16b, v3.16b
+; CHECK-SD-NEXT: and v0.16b, v0.16b, v3.16b
+; CHECK-SD-NEXT: fmov x16, d4
+; CHECK-SD-NEXT: fmov x18, d0
+; CHECK-SD-NEXT: mov x17, v4.d[1]
+; CHECK-SD-NEXT: mov x1, v0.d[1]
+; CHECK-SD-NEXT: mul x16, x14, x16
+; CHECK-SD-NEXT: mul x18, x15, x18
+; CHECK-SD-NEXT: mul x17, x8, x17
+; CHECK-SD-NEXT: fmov d0, x16
+; CHECK-SD-NEXT: mul x1, x8, x1
+; CHECK-SD-NEXT: fmov d4, x18
+; CHECK-SD-NEXT: mov v0.d[1], x17
+; CHECK-SD-NEXT: mov v4.d[1], x1
+; CHECK-SD-NEXT: add v1.2d, v0.2d, v1.2d
+; CHECK-SD-NEXT: add v2.2d, v4.2d, v2.2d
+; CHECK-SD-NEXT: b.ne .LBB6_11
+; CHECK-SD-NEXT: // %bb.12: // %vec.epilog.middle.block
+; CHECK-SD-NEXT: add v0.2d, v1.2d, v2.2d
+; CHECK-SD-NEXT: cmp x11, x10
+; CHECK-SD-NEXT: addp d0, v0.2d
+; CHECK-SD-NEXT: fmov x8, d0
+; CHECK-SD-NEXT: b.eq .LBB6_15
+; CHECK-SD-NEXT: .LBB6_13: // %for.body.preheader
+; CHECK-SD-NEXT: sub x10, x10, x11
+; CHECK-SD-NEXT: add x11, x0, x11
+; CHECK-SD-NEXT: .LBB6_14: // %for.body
+; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-SD-NEXT: ldrb w12, [x11], #1
+; CHECK-SD-NEXT: subs x10, x10, #1
+; CHECK-SD-NEXT: smaddl x8, w12, w9, x8
+; CHECK-SD-NEXT: b.ne .LBB6_14
+; CHECK-SD-NEXT: .LBB6_15:
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x22, x21, [sp, #32] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldp x24, x23, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldr x25, [sp], #64 // 8-byte Folded Reload
+; CHECK-SD-NEXT: mov x0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: red_mla_dup_ext_u8_s8_s64:
+; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: // kill: def $w1 killed $w1 def $x1
+; CHECK-GI-NEXT: cbz w2, .LBB6_7
+; CHECK-GI-NEXT: // %bb.1: // %iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: sxtb x9, w1
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #4
+; CHECK-GI-NEXT: mov w10, w2
+; CHECK-GI-NEXT: b.lo .LBB6_12
+; CHECK-GI-NEXT: // %bb.2: // %vector.main.loop.iter.check
+; CHECK-GI-NEXT: movi d0, #0000000000000000
+; CHECK-GI-NEXT: dup v1.2d, x9
+; CHECK-GI-NEXT: mov x11, xzr
+; CHECK-GI-NEXT: cmp w2, #16
+; CHECK-GI-NEXT: b.lo .LBB6_9
+; CHECK-GI-NEXT: // %bb.3: // %vector.ph
+; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
+; CHECK-GI-NEXT: xtn v2.2s, v1.2d
+; CHECK-GI-NEXT: and x8, x10, #0xc
+; CHECK-GI-NEXT: movi v3.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v4.2d, #0000000000000000
+; CHECK-GI-NEXT: and x11, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v5.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v6.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x0
+; CHECK-GI-NEXT: movi v7.2d, #0000000000000000
+; CHECK-GI-NEXT: movi v16.2d, #0000000000000000
+; CHECK-GI-NEXT: and x13, x10, #0xfffffff0
+; CHECK-GI-NEXT: movi v17.2d, #0000000000000000
+; CHECK-GI-NEXT: .LBB6_4: // %vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr q18, [x12], #16
+; CHECK-GI-NEXT: subs x13, x13, #16
+; CHECK-GI-NEXT: ushll v19.8h, v18.8b, #0
+; CHECK-GI-NEXT: ushll2 v18.8h, v18.16b, #0
+; CHECK-GI-NEXT: ushll v20.4s, v19.4h, #0
+; CHECK-GI-NEXT: ushll2 v19.4s, v19.8h, #0
+; CHECK-GI-NEXT: ushll v21.4s, v18.4h, #0
+; CHECK-GI-NEXT: ushll2 v18.4s, v18.8h, #0
+; CHECK-GI-NEXT: mov d22, v20.d[1]
+; CHECK-GI-NEXT: mov d23, v19.d[1]
+; CHECK-GI-NEXT: mov d24, v21.d[1]
+; CHECK-GI-NEXT: mov d25, v18.d[1]
+; CHECK-GI-NEXT: smlal v0.2d, v2.2s, v20.2s
+; CHECK-GI-NEXT: smlal v4.2d, v2.2s, v19.2s
+; CHECK-GI-NEXT: smlal v6.2d, v2.2s, v21.2s
+; CHECK-GI-NEXT: smlal v16.2d, v2.2s, v18.2s
+; CHECK-GI-NEXT: smlal v3.2d, v2.2s, v22.2s
+; CHECK-GI-NEXT: smlal v5.2d, v2.2s, v23.2s
+; CHECK-GI-NEXT: smlal v7.2d, v2.2s, v24.2s
+; CHECK-GI-NEXT: smlal v17.2d, v2.2s, v25.2s
+; CHECK-GI-NEXT: b.ne .LBB6_4
+; CHECK-GI-NEXT: // %bb.5: // %middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v3.2d
+; CHECK-GI-NEXT: add v2.2d, v4.2d, v5.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: add v3.2d, v6.2d, v7.2d
+; CHECK-GI-NEXT: add v4.2d, v16.2d, v17.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: add v2.2d, v3.2d, v4.2d
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: b.ne .LBB6_8
+; CHECK-GI-NEXT: // %bb.6:
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_7:
+; CHECK-GI-NEXT: mov x8, xzr
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+; CHECK-GI-NEXT: .LBB6_8: // %vec.epilog.iter.check
+; CHECK-GI-NEXT: cbz x8, .LBB6_12
+; CHECK-GI-NEXT: .LBB6_9: // %vec.epilog.ph
+; CHECK-GI-NEXT: mov v0.d[1], xzr
+; CHECK-GI-NEXT: movi v2.2d, #0000000000000000
+; CHECK-GI-NEXT: mov x12, x11
+; CHECK-GI-NEXT: xtn v1.2s, v1.2d
+; CHECK-GI-NEXT: and x11, x10, #0xfffffffc
+; CHECK-GI-NEXT: sub x8, x12, x11
+; CHECK-GI-NEXT: add x12, x0, x12
+; CHECK-GI-NEXT: .LBB6_10: // %vec.epilog.vector.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldr w13, [x12], #4
+; CHECK-GI-NEXT: adds x8, x8, #4
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: uxtb w13, w13
+; CHECK-GI-NEXT: mov b4, v3.b[2]
+; CHECK-GI-NEXT: mov b5, v3.b[1]
+; CHECK-GI-NEXT: mov b6, v3.b[3]
+; CHECK-GI-NEXT: fmov s3, w13
+; CHECK-GI-NEXT: fmov w14, s4
+; CHECK-GI-NEXT: fmov w15, s5
+; CHECK-GI-NEXT: fmov w16, s6
+; CHECK-GI-NEXT: uxtb w14, w14
+; CHECK-GI-NEXT: uxtb w15, w15
+; CHECK-GI-NEXT: uxtb w16, w16
+; CHECK-GI-NEXT: fmov s4, w14
+; CHECK-GI-NEXT: mov v3.s[1], w15
+; CHECK-GI-NEXT: mov v4.s[1], w16
+; CHECK-GI-NEXT: smlal v0.2d, v1.2s, v3.2s
+; CHECK-GI-NEXT: smlal v2.2d, v1.2s, v4.2s
+; CHECK-GI-NEXT: b.ne .LBB6_10
+; CHECK-GI-NEXT: // %bb.11: // %vec.epilog.middle.block
+; CHECK-GI-NEXT: add v0.2d, v0.2d, v2.2d
+; CHECK-GI-NEXT: cmp x11, x10
+; CHECK-GI-NEXT: addp d0, v0.2d
+; CHECK-GI-NEXT: fmov x8, d0
+; CHECK-GI-NEXT: b.eq .LBB6_14
+; CHECK-GI-NEXT: .LBB6_12: // %for.body.preheader
+; CHECK-GI-NEXT: sub x10, x10, x11
+; CHECK-GI-NEXT: add x11, x0, x11
+; CHECK-GI-NEXT: .LBB6_13: // %for.body
+; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
+; CHECK-GI-NEXT: ldrb w8, [x11], #1
+; CHECK-GI-NEXT: fmov x12, d0
+; CHECK-GI-NEXT: subs x10, x10, #1
+; CHECK-GI-NEXT: madd x8, x8, x9, x12
+; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: b.ne .LBB6_13
+; CHECK-GI-NEXT: .LBB6_14: // %for.cond.cleanup
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: ret
+entry:
+ %cmp5.not = icmp eq i32 %n, 0
+ br i1 %cmp5.not, label %for.cond.cleanup, label %iter.check
+
+iter.check: ; preds = %entry
+ %conv1 = sext i8 %B to i64
+ %wide.trip.count = zext i32 %n to i64
+ %min.iters.check = icmp ult i32 %n, 4
+ br i1 %min.iters.check, label %for.body.preheader, label %vector.main.loop.iter.check
+
+vector.main.loop.iter.check: ; preds = %iter.check
+ %min.iters.check9 = icmp ult i32 %n, 16
+ br i1 %min.iters.check9, label %vec.epilog.ph, label %vector.ph
+
+vector.ph: ; preds = %vector.main.loop.iter.check
+ %n.mod.vf = and i64 %wide.trip.count, 12
+ %n.vec = and i64 %wide.trip.count, 4294967280
+ %broadcast.splatinsert = insertelement <16 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat = shufflevector <16 x i64> %broadcast.splatinsert, <16 x i64> poison, <16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %vector.ph
+ %index = phi i64 [ 0, %vector.ph ], [ %index.next, %vector.body ]
+ %vec.phi = phi <16 x i64> [ zeroinitializer, %vector.ph ], [ %3, %vector.body ]
+ %0 = getelementptr inbounds nuw i8, ptr %A, i64 %index
+ %wide.load = load <16 x i8>, ptr %0, align 1
+ %1 = zext <16 x i8> %wide.load to <16 x i64>
+ %2 = mul nsw <16 x i64> %broadcast.splat, %1
+ %3 = add <16 x i64> %2, %vec.phi
+ %index.next = add nuw i64 %index, 16
+ %4 = icmp eq i64 %index.next, %n.vec
+ br i1 %4, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %5 = tail call i64 @llvm.vector.reduce.add.v16i64(<16 x i64> %3)
+ %cmp.n = icmp eq i64 %n.vec, %wide.trip.count
+ br i1 %cmp.n, label %for.cond.cleanup, label %vec.epilog.iter.check
+
+vec.epilog.iter.check: ; preds = %middle.block
+ %min.epilog.iters.check = icmp eq i64 %n.mod.vf, 0
+ br i1 %min.epilog.iters.check, label %for.body.preheader, label %vec.epilog.ph
+
+vec.epilog.ph: ; preds = %vector.main.loop.iter.check, %vec.epilog.iter.check
+ %vec.epilog.resume.val = phi i64 [ %n.vec, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %bc.merge.rdx = phi i64 [ %5, %vec.epilog.iter.check ], [ 0, %vector.main.loop.iter.check ]
+ %n.vec11 = and i64 %wide.trip.count, 4294967292
+ %6 = insertelement <4 x i64> <i64 poison, i64 0, i64 0, i64 0>, i64 %bc.merge.rdx, i64 0
+ %broadcast.splatinsert12 = insertelement <4 x i64> poison, i64 %conv1, i64 0
+ %broadcast.splat13 = shufflevector <4 x i64> %broadcast.splatinsert12, <4 x i64> poison, <4 x i32> zeroinitializer
+ br label %vec.epilog.vector.body
+
+vec.epilog.vector.body: ; preds = %vec.epilog.vector.body, %vec.epilog.ph
+ %index14 = phi i64 [ %vec.epilog.resume.val, %vec.epilog.ph ], [ %index.next17, %vec.epilog.vector.body ]
+ %vec.phi15 = phi <4 x i64> [ %6, %vec.epilog.ph ], [ %10, %vec.epilog.vector.body ]
+ %7 = getelementptr inbounds nuw i8, ptr %A, i64 %index14
+ %wide.load16 = load <4 x i8>, ptr %7, align 1
+ %8 = zext <4 x i8> %wide.load16 to <4 x i64>
+ %9 = mul nsw <4 x i64> %broadcast.splat13, %8
+ %10 = add <4 x i64> %9, %vec.phi15
+ %index.next17 = add nuw i64 %index14, 4
+ %11 = icmp eq i64 %index.next17, %n.vec11
+ br i1 %11, label %vec.epilog.middle.block, label %vec.epilog.vector.body
+
+vec.epilog.middle.block: ; preds = %vec.epilog.vector.body
+ %12 = tail call i64 @llvm.vector.reduce.add.v4i64(<4 x i64> %10)
+ %cmp.n18 = icmp eq i64 %n.vec11, %wide.trip.count
+ br i1 %cmp.n18, label %for.cond.cleanup, label %for.body.preheader
+
+for.body.preheader: ; preds = %iter.check, %vec.epilog.iter.check, %vec.epilog.middle.block
+ %indvars.iv.ph = phi i64 [ 0, %iter.check ], [ %n.vec, %vec.epilog.iter.check ], [ %n.vec11, %vec.epilog.middle.block ]
+ %s.06.ph = phi i64 [ 0, %iter.check ], [ %5, %vec.epilog.iter.check ], [ %12, %vec.epilog.middle.block ]
+ br label %for.body
+
+for.cond.cleanup: ; preds = %for.body, %middle.block, %vec.epilog.middle.block, %entry
+ %s.0.lcssa = phi i64 [ 0, %entry ], [ %5, %middle.block ], [ %12, %vec.epilog.middle.block ], [ %add, %for.body ]
+ ret i64 %s.0.lcssa
+
+for.body: ; preds = %for.body.preheader, %for.body
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ %indvars.iv.ph, %for.body.preheader ]
+ %s.06 = phi i64 [ %add, %for.body ], [ %s.06.ph, %for.body.preheader ]
+ %arrayidx = getelementptr inbounds nuw i8, ptr %A, i64 %indvars.iv
+ %13 = load i8, ptr %arrayidx, align 1
+ %conv = zext i8 %13 to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %add = add nsw i64 %mul, %s.06
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count
+ br i1 %exitcond.not, label %for.cond.cleanup, label %for.body
+}
+
define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-LABEL: sink_v2z64_1:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB6_1: // %loop
+; CHECK-SD-NEXT: .LBB7_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -762,7 +1179,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: umull v1.2d, v1.2s, v0.s[1]
; CHECK-SD-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB6_1
+; CHECK-SD-NEXT: b.ne .LBB7_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -772,7 +1189,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB6_1: // %loop
+; CHECK-GI-NEXT: .LBB7_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -780,7 +1197,7 @@ define void @sink_v2z64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: umull v1.2d, v1.2s, v0.2s
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB6_1
+; CHECK-GI-NEXT: b.ne .LBB7_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -813,7 +1230,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: mov x8, xzr
; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-SD-NEXT: .LBB7_1: // %loop
+; CHECK-SD-NEXT: .LBB8_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -823,7 +1240,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-SD-NEXT: shrn v2.2s, v2.2d, #15
; CHECK-SD-NEXT: shrn2 v2.4s, v1.2d, #15
; CHECK-SD-NEXT: str q2, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB7_1
+; CHECK-SD-NEXT: b.ne .LBB8_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -833,7 +1250,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.2d, v0.d[1]
; CHECK-GI-NEXT: xtn v0.2s, v0.2d
-; CHECK-GI-NEXT: .LBB7_1: // %loop
+; CHECK-GI-NEXT: .LBB8_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -844,7 +1261,7 @@ define void @sink_v4i64_1(ptr %p, ptr %d, i64 %n, <2 x i32> %a) {
; CHECK-GI-NEXT: shrn v1.2s, v1.2d, #15
; CHECK-GI-NEXT: shrn2 v1.4s, v2.2d, #15
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB7_1
+; CHECK-GI-NEXT: b.ne .LBB8_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -877,7 +1294,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.8b, v0.b[0]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB8_1: // %loop
+; CHECK-SD-NEXT: .LBB9_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr d1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -886,7 +1303,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-SD-NEXT: xtn v1.8b, v1.8h
; CHECK-SD-NEXT: str d1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB8_1
+; CHECK-SD-NEXT: b.ne .LBB9_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -896,7 +1313,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[0]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB8_1: // %loop
+; CHECK-GI-NEXT: .LBB9_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr d1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -905,7 +1322,7 @@ define void @sink_v8z16_0(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v1.8h, v1.8h, #0
; CHECK-GI-NEXT: xtn v1.8b, v1.8h
; CHECK-GI-NEXT: str d1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB8_1
+; CHECK-GI-NEXT: b.ne .LBB9_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -938,7 +1355,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: dup v0.16b, v0.b[10]
; CHECK-SD-NEXT: mov x8, xzr
-; CHECK-SD-NEXT: .LBB9_1: // %loop
+; CHECK-SD-NEXT: .LBB10_1: // %loop
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldr q1, [x0]
; CHECK-SD-NEXT: subs x2, x2, #8
@@ -949,7 +1366,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-SD-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-SD-NEXT: uzp1 v1.16b, v2.16b, v1.16b
; CHECK-SD-NEXT: str q1, [x0], #32
-; CHECK-SD-NEXT: b.ne .LBB9_1
+; CHECK-SD-NEXT: b.ne .LBB10_1
; CHECK-SD-NEXT: // %bb.2: // %exit
; CHECK-SD-NEXT: ret
;
@@ -959,7 +1376,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: mov x8, xzr
; CHECK-GI-NEXT: dup v0.8h, v0.h[2]
; CHECK-GI-NEXT: xtn v0.8b, v0.8h
-; CHECK-GI-NEXT: .LBB9_1: // %loop
+; CHECK-GI-NEXT: .LBB10_1: // %loop
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldr q1, [x0]
; CHECK-GI-NEXT: subs x2, x2, #8
@@ -971,7 +1388,7 @@ define void @sink_v16s16_8(ptr %p, ptr %d, i64 %n, <16 x i8> %a) {
; CHECK-GI-NEXT: cmlt v2.8h, v2.8h, #0
; CHECK-GI-NEXT: uzp1 v1.16b, v1.16b, v2.16b
; CHECK-GI-NEXT: str q1, [x0], #32
-; CHECK-GI-NEXT: b.ne .LBB9_1
+; CHECK-GI-NEXT: b.ne .LBB10_1
; CHECK-GI-NEXT: // %bb.2: // %exit
; CHECK-GI-NEXT: ret
entry:
@@ -1005,7 +1422,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: dup v0.4h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
-; CHECK-SD-NEXT: .LBB10_1: // %vector.body
+; CHECK-SD-NEXT: .LBB11_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1015,7 +1432,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-SD-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB10_1
+; CHECK-SD-NEXT: b.ne .LBB11_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1026,7 +1443,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB10_1: // %vector.body
+; CHECK-GI-NEXT: .LBB11_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1036,7 +1453,7 @@ define void @matrix_mul_unsigned_and(i32 %N, ptr nocapture %C, ptr nocapture rea
; CHECK-GI-NEXT: umull v1.4s, v0.4h, v1.4h
; CHECK-GI-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB10_1
+; CHECK-GI-NEXT: b.ne .LBB11_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1089,7 +1506,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: dup v0.8h, w3
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
-; CHECK-SD-NEXT: .LBB11_1: // %vector.body
+; CHECK-SD-NEXT: .LBB12_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1103,7 +1520,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-SD-NEXT: umull v2.4s, v0.4h, v2.4h
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB11_1
+; CHECK-SD-NEXT: b.ne .LBB12_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1114,7 +1531,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
-; CHECK-GI-NEXT: .LBB11_1: // %vector.body
+; CHECK-GI-NEXT: .LBB12_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1130,7 +1547,7 @@ define void @matrix_mul_unsigned_and_double(i32 %N, ptr nocapture %C, ptr nocapt
; CHECK-GI-NEXT: umull v4.4s, v0.4h, v4.4h
; CHECK-GI-NEXT: stp q1, q3, [x9]
; CHECK-GI-NEXT: stp q2, q4, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB11_1
+; CHECK-GI-NEXT: b.ne .LBB12_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1184,7 +1601,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff8
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB12_1: // %vector.body
+; CHECK-SD-NEXT: .LBB13_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #8
@@ -1196,7 +1613,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-SD-NEXT: mul v1.4s, v1.4s, v0.s[0]
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q2, [x9]
-; CHECK-SD-NEXT: b.ne .LBB12_1
+; CHECK-SD-NEXT: b.ne .LBB13_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1206,7 +1623,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff8
-; CHECK-GI-NEXT: .LBB12_1: // %vector.body
+; CHECK-GI-NEXT: .LBB13_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #8
@@ -1218,7 +1635,7 @@ define void @matrix_mul_signed_and(i32 %N, ptr nocapture %C, ptr nocapture reado
; CHECK-GI-NEXT: mul v1.4s, v0.4s, v1.4s
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q1, q2, [x9]
-; CHECK-GI-NEXT: b.ne .LBB12_1
+; CHECK-GI-NEXT: b.ne .LBB13_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1272,7 +1689,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: // kill: def $w0 killed $w0 def $x0
; CHECK-SD-NEXT: and x8, x0, #0xfffffff0
; CHECK-SD-NEXT: fmov s0, w9
-; CHECK-SD-NEXT: .LBB13_1: // %vector.body
+; CHECK-SD-NEXT: .LBB14_1: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: add x9, x2, w0, uxtw #1
; CHECK-SD-NEXT: subs x8, x8, #16
@@ -1290,7 +1707,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-SD-NEXT: mul v2.4s, v2.4s, v0.s[0]
; CHECK-SD-NEXT: stp q1, q3, [x9]
; CHECK-SD-NEXT: stp q2, q4, [x9, #32]
-; CHECK-SD-NEXT: b.ne .LBB13_1
+; CHECK-SD-NEXT: b.ne .LBB14_1
; CHECK-SD-NEXT: // %bb.2: // %for.end12
; CHECK-SD-NEXT: ret
;
@@ -1300,7 +1717,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: dup v0.4s, w8
; CHECK-GI-NEXT: mov w8, w0
; CHECK-GI-NEXT: and x8, x8, #0xfffffff0
-; CHECK-GI-NEXT: .LBB13_1: // %vector.body
+; CHECK-GI-NEXT: .LBB14_1: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: add x9, x2, w0, uxtw #1
; CHECK-GI-NEXT: subs x8, x8, #16
@@ -1318,7 +1735,7 @@ define void @matrix_mul_signed_and_double(i32 %N, ptr nocapture %C, ptr nocaptur
; CHECK-GI-NEXT: mul v2.4s, v0.4s, v2.4s
; CHECK-GI-NEXT: stp q3, q1, [x9]
; CHECK-GI-NEXT: stp q4, q2, [x9, #32]!
-; CHECK-GI-NEXT: b.ne .LBB13_1
+; CHECK-GI-NEXT: b.ne .LBB14_1
; CHECK-GI-NEXT: // %bb.2: // %for.end12
; CHECK-GI-NEXT: ret
vector.header:
@@ -1369,9 +1786,9 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-SD-LABEL: cmplx_mul_combined_re_im:
; CHECK-SD: // %bb.0: // %entry
; CHECK-SD-NEXT: lsr x9, x0, #16
-; CHECK-SD-NEXT: adrp x8, .LCPI14_0
+; CHECK-SD-NEXT: adrp x8, .LCPI15_0
; CHECK-SD-NEXT: dup v4.8h, w0
-; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI14_0]
+; CHECK-SD-NEXT: ldr q3, [x8, :lo12:.LCPI15_0]
; CHECK-SD-NEXT: dup v2.8h, w9
; CHECK-SD-NEXT: sqneg v1.8h, v2.8h
; CHECK-SD-NEXT: tbl v1.16b, { v1.16b, v2.16b }, v3.16b
@@ -1386,12 +1803,12 @@ define noundef <8 x i16> @cmplx_mul_combined_re_im(<8 x i16> noundef %a, i64 %sc
; CHECK-GI-LABEL: cmplx_mul_combined_re_im:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: lsr w9, w0, #16
-; CHECK-GI-NEXT: adrp x8, .LCPI14_0
+; CHECK-GI-NEXT: adrp x8, .LCPI15_0
; CHECK-GI-NEXT: rev32 v4.8h, v0.8h
; CHECK-GI-NEXT: dup v1.8h, w9
; CHECK-GI-NEXT: fmov s3, w9
; CHECK-GI-NEXT: sqneg v2.8h, v1.8h
-; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI14_0]
+; CHECK-GI-NEXT: ldr q1, [x8, :lo12:.LCPI15_0]
; CHECK-GI-NEXT: tbl v1.16b, { v2.16b, v3.16b }, v1.16b
; CHECK-GI-NEXT: mov d2, v0.d[1]
; CHECK-GI-NEXT: dup v3.8h, w0
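
The bulk of this file's diff is the new red_mla_dup_ext_u8_s8_s64 test plus mechanical renumbering of the later .LBB and .LCPI labels (each shifted by one) caused by the inserted function. The new test covers a widening multiply-accumulate reduction in which one operand is a broadcast of the sign-extended i8 scalar %B; its vector-body kernel, reduced from the IR above (illustrative):

  %w = zext <16 x i8> %wide.load to <16 x i64>
  %m = mul nsw <16 x i64> %broadcast.splat, %w   ; splat of sext(i8 %B) to i64
  %acc = add <16 x i64> %m, %vec.phi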
diff --git a/llvm/test/CodeGen/AArch64/aarch64-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
index 6e5c666..0cd885e 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-smull.ll
@@ -222,22 +222,20 @@ define <4 x i32> @smull_zext_v4i16_v4i32(ptr %A, ptr %B) nounwind {
define <2 x i64> @smull_zext_v2i32_v2i64(ptr %A, ptr %B) nounwind {
; CHECK-NEON-LABEL: smull_zext_v2i32_v2i64:
; CHECK-NEON: // %bb.0:
-; CHECK-NEON-NEXT: ldrh w8, [x0]
-; CHECK-NEON-NEXT: ldrh w9, [x0, #2]
+; CHECK-NEON-NEXT: ldrh w8, [x0, #2]
+; CHECK-NEON-NEXT: ldr h0, [x0]
; CHECK-NEON-NEXT: ldr d1, [x1]
-; CHECK-NEON-NEXT: fmov d0, x8
-; CHECK-NEON-NEXT: mov v0.d[1], x9
+; CHECK-NEON-NEXT: mov v0.d[1], x8
; CHECK-NEON-NEXT: xtn v0.2s, v0.2d
; CHECK-NEON-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-NEON-NEXT: ret
;
; CHECK-SVE-LABEL: smull_zext_v2i32_v2i64:
; CHECK-SVE: // %bb.0:
-; CHECK-SVE-NEXT: ldrh w8, [x0]
-; CHECK-SVE-NEXT: ldrh w9, [x0, #2]
+; CHECK-SVE-NEXT: ldrh w8, [x0, #2]
+; CHECK-SVE-NEXT: ldr h0, [x0]
; CHECK-SVE-NEXT: ldr d1, [x1]
-; CHECK-SVE-NEXT: fmov d0, x8
-; CHECK-SVE-NEXT: mov v0.d[1], x9
+; CHECK-SVE-NEXT: mov v0.d[1], x8
; CHECK-SVE-NEXT: xtn v0.2s, v0.2d
; CHECK-SVE-NEXT: smull v0.2d, v0.2s, v1.2s
; CHECK-SVE-NEXT: ret
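
In smull_zext_v2i32_v2i64 the low i16 is now loaded straight into an FPR with ldr h0 instead of going through a GPR load plus fmov, dropping one instruction and a GPR-to-FPR transfer; only the high lane still takes the GPR route for the lane insert. The new sequence, restated from the CHECK lines above (illustrative):

  ldr  h0, [x0]        ; was: ldrh w8, [x0]  +  fmov d0, x8
  ldrh w8, [x0, #2]
  mov  v0.d[1], x8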
diff --git a/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
new file mode 100644
index 0000000..a729772
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/arm64-cvt-simd-fptoi.ll
@@ -0,0 +1,1943 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fullfp16 | FileCheck %s --check-prefixes=CHECK-NOFPRCVT
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -mattr=+fprcvt,+fullfp16 | FileCheck %s --check-prefixes=CHECK
+; RUN: llc < %s -mtriple aarch64-unknown-unknown -global-isel -global-isel-abort=2 -mattr=+fprcvt,+fullfp16 2>&1 | FileCheck %s --check-prefixes=CHECK
+
+; CHECK-GI: warning: Instruction selection used fallback path for fptosi_i32_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i32_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i64_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptosi_i32_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f16_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f32_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i64_f64_simd
+; CHECK-GI-NEXT: warning: Instruction selection used fallback path for fptoui_i32_f32_simd
+
+;
+; FPTOI
+;
+
+define float @test_fptosi_f16_i32_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f16_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f16_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %r = fptosi half %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptosi_f16_i64_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f16_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f16_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %r = fptosi half %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define float @test_fptosi_f64_i32_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f64_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f64_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = fptosi double %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptosi_f32_i64_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f32_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f32_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = fptosi float %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define double @test_fptosi_f64_i64_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f64_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f64_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = fptosi double %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+
+define float @test_fptosi_f32_i32_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptosi_f32_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptosi_f32_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = fptosi float %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define float @test_fptoui_f16_i32_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f16_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f16_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %r = fptoui half %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptoui_f16_i64_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f16_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f16_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %r = fptoui half %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define float @test_fptoui_f64_i32_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f64_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f64_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = fptoui double %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+define double @test_fptoui_f32_i64_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f32_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f32_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = fptoui float %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+define double @test_fptoui_f64_i64_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f64_i64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f64_i64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = fptoui double %a to i64
+ %bc = bitcast i64 %r to double
+ ret double %bc
+}
+
+
+define float @test_fptoui_f32_i32_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: test_fptoui_f32_i32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: test_fptoui_f32_i32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = fptoui float %a to i32
+ %bc = bitcast i32 %r to float
+ ret float %bc
+}
+
+
+;
+; FPTOI strictfp
+;
+
+define float @fptosi_i32_f16_simd(half %x) {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f16(half %x, metadata !"fpexcept.strict")
+ %sum = bitcast i32 %val to float
+ ret float %sum
+}
+
+define double @fptosi_i64_f16_simd(half %x) {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f16(half %x, metadata !"fpexcept.strict")
+ %sum = bitcast i64 %val to double
+ ret double %sum
+}
+
+define double @fptosi_i64_f32_simd(float %x) {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f32(float %x, metadata !"fpexcept.strict")
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptosi_i32_f64_simd(double %x) {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f64(double %x, metadata !"fpexcept.strict")
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+define double @fptosi_i64_f64_simd(double %x) {
+; CHECK-NOFPRCVT-LABEL: fptosi_i64_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i64_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptosi.i64.f64(double %x, metadata !"fpexcept.strict")
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptosi_i32_f32_simd(float %x) {
+; CHECK-NOFPRCVT-LABEL: fptosi_i32_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptosi_i32_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptosi.i32.f32(float %x, metadata !"fpexcept.strict")
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+
+
+define float @fptoui_i32_f16_simd(half %x) {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f16(half %x, metadata !"fpexcept.strict")
+ %sum = bitcast i32 %val to float
+ ret float %sum
+}
+
+define double @fptoui_i64_f16_simd(half %x) {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f16_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f16_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f16(half %x, metadata !"fpexcept.strict")
+ %sum = bitcast i64 %val to double
+ ret double %sum
+}
+
+define double @fptoui_i64_f32_simd(float %x) {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f32(float %x, metadata !"fpexcept.strict")
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptoui_i32_f64_simd(double %x) {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f64(double %x, metadata !"fpexcept.strict")
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+define double @fptoui_i64_f64_simd(double %x) {
+; CHECK-NOFPRCVT-LABEL: fptoui_i64_f64_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i64_f64_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %val = call i64 @llvm.experimental.constrained.fptoui.i64.f64(double %x, metadata !"fpexcept.strict")
+ %bc = bitcast i64 %val to double
+ ret double %bc
+}
+
+define float @fptoui_i32_f32_simd(float %x) {
+; CHECK-NOFPRCVT-LABEL: fptoui_i32_f32_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fptoui_i32_f32_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %val = call i32 @llvm.experimental.constrained.fptoui.i32.f32(float %x, metadata !"fpexcept.strict")
+ %bc = bitcast i32 %val to float
+ ret float %bc
+}
+
+;
+; FPTOI rounding
+;
+
+
+define double @fcvtas_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtas_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtas_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtau_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtau_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtms_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtms_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtmu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtmu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtps_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtps_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtpu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtpu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+define double @fcvtzs_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptosi float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptosi double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptosi float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptosi double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptoui float %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptoui double %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_round_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = fptoui float %r to i32
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_round_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_round_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_round_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = fptoui double %r to i64
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+
+;
+; FPTOI saturating
+;
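+; fptosi.sat/fptoui.sat map directly onto fcvtzs/fcvtzu, which already
+; saturate on out-of-range inputs, so no extra clamping sequence is expected.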
+
+define float @fcvtzs_sh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzs_ds_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dh_sat_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dh_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dh_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_sat_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %a)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_sat_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_sat_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_sat_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %a)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+;
+; FPTOI saturating with rounding
+;
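+; A rounding intrinsic feeding fptosi.sat/fptoui.sat should fold to the single
+; correspondingly-rounded convert, as in the plain rounding tests above; the
+; f16-source variants are also covered here.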
+
+define float @fcvtas_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtas_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtas_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtas_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtas_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtas_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtas d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtas_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtas d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.round.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtau_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtau_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtau_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.round.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtau_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtau_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtau d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtau_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtau d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.round.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtms_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtms_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtms_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtms_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtms_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtms d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtms_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtms d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.floor.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtmu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtmu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtmu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.floor.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtmu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtmu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtmu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtmu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtmu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.floor.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtps_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtps_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtps_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtps_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtps_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtps d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtps_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtps d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.ceil.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtpu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtpu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtpu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.ceil.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtpu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtpu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtpu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtpu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtpu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.ceil.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptosi.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptosi.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzs_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzs_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzs_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i32 @llvm.fptosi.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzs_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzs_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzs d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzs_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzs d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i64 @llvm.fptosi.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, h0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i32 @llvm.fptoui.sat.i32.f16(half %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dh_simd(half %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dh_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, h0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dh_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, h0
+; CHECK-NEXT: ret
+ %r = call half @llvm.trunc.f16(half %a) nounwind readnone
+ %i = call i64 @llvm.fptoui.sat.i64.f16(half %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define double @fcvtzu_ds_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ds_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu x8, s0
+; CHECK-NOFPRCVT-NEXT: fmov d0, x8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ds_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f32(float %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
+
+define float @fcvtzu_sd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_sd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu w8, d0
+; CHECK-NOFPRCVT-NEXT: fmov s0, w8
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_sd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f64(double %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define float @fcvtzu_ss_simd(float %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_ss_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu s0, s0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_ss_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu s0, s0
+; CHECK-NEXT: ret
+ %r = call float @llvm.trunc.f32(float %a)
+ %i = call i32 @llvm.fptoui.sat.i32.f32(float %r)
+ %bc = bitcast i32 %i to float
+ ret float %bc
+}
+
+define double @fcvtzu_dd_simd(double %a) {
+; CHECK-NOFPRCVT-LABEL: fcvtzu_dd_simd:
+; CHECK-NOFPRCVT: // %bb.0:
+; CHECK-NOFPRCVT-NEXT: fcvtzu d0, d0
+; CHECK-NOFPRCVT-NEXT: ret
+;
+; CHECK-LABEL: fcvtzu_dd_simd:
+; CHECK: // %bb.0:
+; CHECK-NEXT: fcvtzu d0, d0
+; CHECK-NEXT: ret
+ %r = call double @llvm.trunc.f64(double %a)
+ %i = call i64 @llvm.fptoui.sat.i64.f64(double %r)
+ %bc = bitcast i64 %i to double
+ ret double %bc
+}
diff --git a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
index e18a5f6..d8f3708 100644
--- a/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-neon-copy.ll
@@ -980,12 +980,18 @@ define <1 x double> @test_bitcasti64tov1f64(i64 %in) {
}
define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
-; CHECK-LABEL: test_bitcastv8i8tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.8b, v0.8b
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv8i8tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.8b, v0.8b
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv8i8tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.8b, v0.8b
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <8 x i8> zeroinitializer, %a
%1 = bitcast <8 x i8> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -993,12 +999,18 @@ define <1 x i64> @test_bitcastv8i8tov1f64(<8 x i8> %a) #0 {
}
define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
-; CHECK-LABEL: test_bitcastv4i16tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.4h, v0.4h
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv4i16tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.4h, v0.4h
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv4i16tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.4h, v0.4h
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <4 x i16> zeroinitializer, %a
%1 = bitcast <4 x i16> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -1006,12 +1018,18 @@ define <1 x i64> @test_bitcastv4i16tov1f64(<4 x i16> %a) #0 {
}
define <1 x i64> @test_bitcastv2i32tov1f64(<2 x i32> %a) #0 {
-; CHECK-LABEL: test_bitcastv2i32tov1f64:
-; CHECK: // %bb.0:
-; CHECK-NEXT: neg v0.2s, v0.2s
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_bitcastv2i32tov1f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: neg v0.2s, v0.2s
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_bitcastv2i32tov1f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: neg v0.2s, v0.2s
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%sub.i = sub <2 x i32> zeroinitializer, %a
%1 = bitcast <2 x i32> %sub.i to <1 x double>
%vcvt.i = fptosi <1 x double> %1 to <1 x i64>
@@ -1031,8 +1049,7 @@ define <1 x i64> @test_bitcastv1i64tov1f64(<1 x i64> %a) #0 {
; CHECK-GI-NEXT: fmov x8, d0
; CHECK-GI-NEXT: neg x8, x8
; CHECK-GI-NEXT: fmov d0, x8
-; CHECK-GI-NEXT: fcvtzs x8, d0
-; CHECK-GI-NEXT: fmov d0, x8
+; CHECK-GI-NEXT: fcvtzs d0, d0
; CHECK-GI-NEXT: ret
%sub.i = sub <1 x i64> zeroinitializer, %a
%1 = bitcast <1 x i64> %sub.i to <1 x double>
diff --git a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
index 627d31f..1e0cfa0 100644
--- a/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
+++ b/llvm/test/CodeGen/AArch64/arm64-vcvt.ll
@@ -359,11 +359,16 @@ define <2 x i64> @fcvtzs_2d(<2 x double> %A) nounwind {
; FIXME: Generate "fcvtzs d0, d0"?
define <1 x i64> @fcvtzs_1d(<1 x double> %A) nounwind {
-; CHECK-LABEL: fcvtzs_1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzs x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fcvtzs_1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzs x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fcvtzs_1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtzs d0, d0
+; CHECK-GI-NEXT: ret
%tmp3 = fptosi <1 x double> %A to <1 x i64>
ret <1 x i64> %tmp3
}
@@ -438,11 +443,16 @@ define <2 x i64> @fcvtzu_2d(<2 x double> %A) nounwind {
; FIXME: Generate "fcvtzu d0, d0"?
define <1 x i64> @fcvtzu_1d(<1 x double> %A) nounwind {
-; CHECK-LABEL: fcvtzu_1d:
-; CHECK: // %bb.0:
-; CHECK-NEXT: fcvtzu x8, d0
-; CHECK-NEXT: fmov d0, x8
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: fcvtzu_1d:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: fcvtzu x8, d0
+; CHECK-SD-NEXT: fmov d0, x8
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: fcvtzu_1d:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: fcvtzu d0, d0
+; CHECK-GI-NEXT: ret
%tmp3 = fptoui <1 x double> %A to <1 x i64>
ret <1 x i64> %tmp3
}
diff --git a/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
new file mode 100644
index 0000000..cf52934
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/dup-ext-load-combine.ll
@@ -0,0 +1,178 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=aarch64-none-linux-gnu < %s | FileCheck %s
+
+; Test the optimization of DUP fed by zero-extended narrow loads.
+; This should avoid GPR->SIMD transfers by loading directly into a vector register.
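+;
+; For example, broadcasting a zero-extended i8 load to <4 x i32> should become
+; (illustrative registers, mirroring the checks below):
+;   ldr b0, [x0]          // load straight into an FPR, upper bits zeroed
+;   dup v0.4s, v0.s[0]    // duplicate lane 0
+; rather than the GPR round-trip:
+;   ldrb w8, [x0]
+;   dup v0.4s, w8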
+
+define <4 x i16> @test_dup_zextload_i8_v4i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <4 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <4 x i16> %vec, <4 x i16> poison, <4 x i32> zeroinitializer
+ ret <4 x i16> %dup
+}
+
+define <8 x i16> @test_dup_zextload_i8_v8i16(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.8h, v0.h[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i16
+ %vec = insertelement <8 x i16> poison, i16 %ext, i32 0
+ %dup = shufflevector <8 x i16> %vec, <8 x i16> poison, <8 x i32> zeroinitializer
+ ret <8 x i16> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i8_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, #4]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 4
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i8_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i8_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0, x1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i8, ptr %p, i64 %offset
+ %load = load i8, ptr %addr, align 1
+ %ext = zext i8 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i8_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i8_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i8, ptr %p, align 1
+ %ext = zext i8 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i32> @test_dup_zextload_i16_v2i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <2 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <2 x i32> %vec, <2 x i32> poison, <2 x i32> zeroinitializer
+ ret <2 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_offset(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, #8]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 4
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <4 x i32> @test_dup_zextload_i16_v4i32_reg_offset(ptr %p, i64 %offset) {
+; CHECK-LABEL: test_dup_zextload_i16_v4i32_reg_offset:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0, x1, lsl #1]
+; CHECK-NEXT: dup v0.4s, v0.s[0]
+; CHECK-NEXT: ret
+ %addr = getelementptr inbounds i16, ptr %p, i64 %offset
+ %load = load i16, ptr %addr, align 1
+ %ext = zext i16 %load to i32
+ %vec = insertelement <4 x i32> poison, i32 %ext, i32 0
+ %dup = shufflevector <4 x i32> %vec, <4 x i32> poison, <4 x i32> zeroinitializer
+ ret <4 x i32> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i16_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i16_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr h0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i16, ptr %p, align 1
+ %ext = zext i16 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
+
+define <2 x i64> @test_dup_zextload_i32_v2i64(ptr %p) {
+; CHECK-LABEL: test_dup_zextload_i32_v2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr s0, [x0]
+; CHECK-NEXT: dup v0.2d, v0.d[0]
+; CHECK-NEXT: ret
+ %load = load i32, ptr %p, align 1
+ %ext = zext i32 %load to i64
+ %vec = insertelement <2 x i64> poison, i64 %ext, i32 0
+ %dup = shufflevector <2 x i64> %vec, <2 x i64> poison, <2 x i32> zeroinitializer
+ ret <2 x i64> %dup
+}
diff --git a/llvm/test/CodeGen/AArch64/dup.ll b/llvm/test/CodeGen/AArch64/dup.ll
index 079ff10..670574f2 100644
--- a/llvm/test/CodeGen/AArch64/dup.ll
+++ b/llvm/test/CodeGen/AArch64/dup.ll
@@ -32,8 +32,8 @@ entry:
define <2 x i8> @loaddup_v2i8(ptr %p) {
; CHECK-LABEL: loaddup_v2i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0]
-; CHECK-NEXT: dup v0.2s, w8
+; CHECK-NEXT: ldr b0, [x0]
+; CHECK-NEXT: dup v0.2s, v0.s[0]
; CHECK-NEXT: ret
entry:
%a = load i8, ptr %p
@@ -189,8 +189,8 @@ entry:
define <4 x i8> @loaddup_v4i8(ptr %p) {
; CHECK-SD-LABEL: loaddup_v4i8:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrb w8, [x0]
-; CHECK-SD-NEXT: dup v0.4h, w8
+; CHECK-SD-NEXT: ldr b0, [x0]
+; CHECK-SD-NEXT: dup v0.4h, v0.h[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v4i8:
@@ -444,8 +444,8 @@ entry:
define <2 x i16> @loaddup_v2i16(ptr %p) {
; CHECK-SD-LABEL: loaddup_v2i16:
; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: ldrh w8, [x0]
-; CHECK-SD-NEXT: dup v0.2s, w8
+; CHECK-SD-NEXT: ldr h0, [x0]
+; CHECK-SD-NEXT: dup v0.2s, v0.s[0]
; CHECK-SD-NEXT: ret
;
; CHECK-GI-LABEL: loaddup_v2i16:
diff --git a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
index c741129..b963acd 100644
--- a/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptosi-sat-vector.ll
@@ -31,8 +31,7 @@ define <1 x i32> @test_signed_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_signed_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtzs w8, s0
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fcvtzs s0, s0
; CHECK-GI-NEXT: ret
%x = call <1 x i32> @llvm.fptosi.sat.v1f32.v1i32(<1 x float> %f)
ret <1 x i32> %x
@@ -1162,18 +1161,24 @@ declare <7 x i32> @llvm.fptosi.sat.v7f16.v7i32 (<7 x half>)
declare <8 x i32> @llvm.fptosi.sat.v8f16.v8i32 (<8 x half>)
define <1 x i32> @test_signed_v1f16_v1i32(<1 x half> %f) {
-; CHECK-CVT-LABEL: test_signed_v1f16_v1i32:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fcvtzs w8, s0
-; CHECK-CVT-NEXT: fmov s0, w8
-; CHECK-CVT-NEXT: ret
+; CHECK-SD-CVT-LABEL: test_signed_v1f16_v1i32:
+; CHECK-SD-CVT: // %bb.0:
+; CHECK-SD-CVT-NEXT: fcvt s0, h0
+; CHECK-SD-CVT-NEXT: fcvtzs w8, s0
+; CHECK-SD-CVT-NEXT: fmov s0, w8
+; CHECK-SD-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_signed_v1f16_v1i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzs w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: ret
+;
+; CHECK-GI-CVT-LABEL: test_signed_v1f16_v1i32:
+; CHECK-GI-CVT: // %bb.0:
+; CHECK-GI-CVT-NEXT: fcvt s0, h0
+; CHECK-GI-CVT-NEXT: fcvtzs s0, s0
+; CHECK-GI-CVT-NEXT: ret
%x = call <1 x i32> @llvm.fptosi.sat.v1f16.v1i32(<1 x half> %f)
ret <1 x i32> %x
}
diff --git a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
index efe0a1b..5a66b68 100644
--- a/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
+++ b/llvm/test/CodeGen/AArch64/fptoui-sat-vector.ll
@@ -31,8 +31,7 @@ define <1 x i32> @test_unsigned_v1f32_v1i32(<1 x float> %f) {
;
; CHECK-GI-LABEL: test_unsigned_v1f32_v1i32:
; CHECK-GI: // %bb.0:
-; CHECK-GI-NEXT: fcvtzu w8, s0
-; CHECK-GI-NEXT: fmov s0, w8
+; CHECK-GI-NEXT: fcvtzu s0, s0
; CHECK-GI-NEXT: ret
%x = call <1 x i32> @llvm.fptoui.sat.v1f32.v1i32(<1 x float> %f)
ret <1 x i32> %x
@@ -993,18 +992,24 @@ declare <7 x i32> @llvm.fptoui.sat.v7f16.v7i32 (<7 x half>)
declare <8 x i32> @llvm.fptoui.sat.v8f16.v8i32 (<8 x half>)
define <1 x i32> @test_unsigned_v1f16_v1i32(<1 x half> %f) {
-; CHECK-CVT-LABEL: test_unsigned_v1f16_v1i32:
-; CHECK-CVT: // %bb.0:
-; CHECK-CVT-NEXT: fcvt s0, h0
-; CHECK-CVT-NEXT: fcvtzu w8, s0
-; CHECK-CVT-NEXT: fmov s0, w8
-; CHECK-CVT-NEXT: ret
+; CHECK-SD-CVT-LABEL: test_unsigned_v1f16_v1i32:
+; CHECK-SD-CVT: // %bb.0:
+; CHECK-SD-CVT-NEXT: fcvt s0, h0
+; CHECK-SD-CVT-NEXT: fcvtzu w8, s0
+; CHECK-SD-CVT-NEXT: fmov s0, w8
+; CHECK-SD-CVT-NEXT: ret
;
; CHECK-FP16-LABEL: test_unsigned_v1f16_v1i32:
; CHECK-FP16: // %bb.0:
; CHECK-FP16-NEXT: fcvtzu w8, h0
; CHECK-FP16-NEXT: fmov s0, w8
; CHECK-FP16-NEXT: ret
+;
+; CHECK-GI-CVT-LABEL: test_unsigned_v1f16_v1i32:
+; CHECK-GI-CVT: // %bb.0:
+; CHECK-GI-CVT-NEXT: fcvt s0, h0
+; CHECK-GI-CVT-NEXT: fcvtzu s0, s0
+; CHECK-GI-CVT-NEXT: ret
%x = call <1 x i32> @llvm.fptoui.sat.v1f16.v1i32(<1 x half> %f)
ret <1 x i32> %x
}
diff --git a/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll b/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll
new file mode 100644
index 0000000..85991fb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ldst-prepost-uses.ll
@@ -0,0 +1,73 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -O3 -mtriple=aarch64 | FileCheck %s
+
+; From #164775: this generates a pre-index load feeding a post-index store, a
+; case where the wrong instruction's uses were checked when forming the
+; post-increment. It is quite delicate to get the compiler to generate this
+; combination at just the right point to hit the same issue.
+
+@g_260 = dso_local global i16 0
+@g_480 = dso_local global i16 0
+
+define i32 @func_1(ptr %l_3253) {
+; CHECK-LABEL: func_1:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: sub sp, sp, #128
+; CHECK-NEXT: .cfi_def_cfa_offset 128
+; CHECK-NEXT: movi v0.2d, #0000000000000000
+; CHECK-NEXT: mov w9, #2 // =0x2
+; CHECK-NEXT: mov w10, #96 // =0x60
+; CHECK-NEXT: strb wzr, [x9]
+; CHECK-NEXT: mov w9, #111 // =0x6f
+; CHECK-NEXT: mov x8, xzr
+; CHECK-NEXT: str wzr, [x9]
+; CHECK-NEXT: mov w9, #80 // =0x50
+; CHECK-NEXT: adrp x1, .L_MergedGlobals
+; CHECK-NEXT: add x1, x1, :lo12:.L_MergedGlobals
+; CHECK-NEXT: strh wzr, [x8]
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #48 // =0x30
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #32 // =0x20
+; CHECK-NEXT: str q0, [x10]
+; CHECK-NEXT: mov w10, #64 // =0x40
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: mov w9, #16 // =0x10
+; CHECK-NEXT: str q0, [x10]
+; CHECK-NEXT: str q0, [x9]
+; CHECK-NEXT: str q0, [x8]
+; CHECK-NEXT: adrp x8, .L_MergedGlobals
+; CHECK-NEXT: strb wzr, [x0, #8]
+; CHECK-NEXT: strb wzr, [x0, #12]
+; CHECK-NEXT: strb wzr, [x0, #16]
+; CHECK-NEXT: strb wzr, [x0, #20]
+; CHECK-NEXT: mov w0, wzr
+; CHECK-NEXT: ldrh wzr, [x8, :lo12:.L_MergedGlobals]
+; CHECK-NEXT: ldrh w8, [x1, #4]!
+; CHECK-NEXT: sub w8, w8, #1
+; CHECK-NEXT: strh w8, [x1]
+; CHECK-NEXT: add sp, sp, #128
+; CHECK-NEXT: b use
+entry:
+ %l_32531.sroa.3 = alloca [3 x i8], align 4
+ %l_32531.sroa.4 = alloca [115 x i8], align 4
+ call void @llvm.lifetime.start.p0(ptr %l_32531.sroa.3)
+ call void @llvm.lifetime.start.p0(ptr %l_32531.sroa.4)
+ call void @llvm.memset.p0.i64(ptr null, i8 0, i64 3, i1 false)
+ call void @llvm.memset.p0.i64(ptr null, i8 0, i64 115, i1 false)
+ %0 = getelementptr inbounds i8, ptr %l_3253, i64 8
+ store i8 0, ptr %0, align 4
+ %1 = getelementptr inbounds i8, ptr %l_3253, i64 12
+ store i8 0, ptr %1, align 4
+ %2 = getelementptr inbounds i8, ptr %l_3253, i64 16
+ store i8 0, ptr %2, align 4
+ %3 = getelementptr inbounds i8, ptr %l_3253, i64 20
+ store i8 0, ptr %3, align 4
+ %4 = load volatile i16, ptr @g_260, align 4
+ %5 = load i16, ptr @g_480, align 4
+ %dec.i.i = add i16 %5, -1
+ store i16 %dec.i.i, ptr @g_480, align 4
+ %call1 = tail call i32 @use(i32 0, ptr @g_480)
+ ret i32 %call1
+}
+
+declare i32 @use(i32, ptr)
diff --git a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
index 9193025..628506b 100644
--- a/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
+++ b/llvm/test/CodeGen/AArch64/load-zext-bitcast.ll
@@ -84,8 +84,7 @@ entry:
define double @load_u64_from_u32_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -98,8 +97,7 @@ entry:
define double @load_u64_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -112,8 +110,7 @@ entry:
define double @load_u64_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -126,8 +123,7 @@ entry:
define float @load_u32_from_u16_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -140,8 +136,7 @@ entry:
define float @load_u32_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 1
@@ -154,8 +149,7 @@ entry:
define half @load_u16_from_u8_off1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #1]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #1]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -171,8 +165,7 @@ entry:
define double @load_u64_from_u32_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -185,8 +178,7 @@ entry:
define double @load_u64_from_u16_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrh w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr h0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -199,8 +191,7 @@ entry:
define double @load_u64_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #2]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -226,7 +217,7 @@ entry:
define float @load_u32_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 2
@@ -239,7 +230,7 @@ entry:
define half @load_u16_from_u8_off2(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off2:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1]
+; CHECK-NEXT: ldr b0, [x0, #2]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -255,8 +246,7 @@ entry:
define double @load_u64_from_u32_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u32_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldur w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur s0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -269,8 +259,7 @@ entry:
define double @load_u64_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -283,8 +272,7 @@ entry:
define double @load_u64_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov d0, x8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -297,8 +285,7 @@ entry:
define float @load_u32_from_u16_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u16_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldurh w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldur h0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -311,8 +298,7 @@ entry:
define float @load_u32_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 255
@@ -325,8 +311,7 @@ entry:
define half @load_u16_from_u8_off255(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off255:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldrb w8, [x0, #255]
-; CHECK-NEXT: fmov s0, w8
+; CHECK-NEXT: ldr b0, [x0, #255]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -354,7 +339,7 @@ entry:
define double @load_u64_from_u16_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u16_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #128]
+; CHECK-NEXT: ldr h0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -367,7 +352,7 @@ entry:
define double @load_u64_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u64_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #64]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -393,7 +378,7 @@ entry:
define float @load_u32_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u32_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 256
@@ -406,7 +391,7 @@ entry:
define half @load_u16_from_u8_off256(ptr %n){
; CHECK-LABEL: load_u16_from_u8_off256:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #128]
+; CHECK-NEXT: ldr b0, [x0, #256]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
@@ -435,8 +420,7 @@ entry:
define double @load_u64_from_u16_offn(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offn:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: mov w8, #8190 // =0x1ffe
-; CHECK-NEXT: ldr h0, [x0, x8]
+; CHECK-NEXT: ldr h0, [x0, #8190]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8190
@@ -503,8 +487,8 @@ entry:
define double @load_u64_from_u32_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u32_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #4, lsl #12 // =16384
-; CHECK-NEXT: ldr s0, [x8]
+; CHECK-NEXT: mov w8, #16384 // =0x4000
+; CHECK-NEXT: ldr s0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 16384
@@ -517,7 +501,8 @@ entry:
define double @load_u64_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr h0, [x0, #4096]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -530,7 +515,8 @@ entry:
define double @load_u64_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u64_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #1024]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -543,8 +529,8 @@ entry:
define float @load_u32_from_u16_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u16_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: add x8, x0, #2, lsl #12 // =8192
-; CHECK-NEXT: ldr h0, [x8]
+; CHECK-NEXT: mov w8, #8192 // =0x2000
+; CHECK-NEXT: ldr h0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 8192
@@ -557,7 +543,8 @@ entry:
define float @load_u32_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u32_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: ret
entry:
%p = getelementptr i8, ptr %n, i64 4096
@@ -570,7 +557,8 @@ entry:
define half @load_u16_from_u8_offnp1(ptr %n){
; CHECK-LABEL: load_u16_from_u8_offnp1:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: ldr b0, [x0, #2048]
+; CHECK-NEXT: mov w8, #4096 // =0x1000
+; CHECK-NEXT: ldr b0, [x0, x8]
; CHECK-NEXT: // kill: def $h0 killed $h0 killed $s0
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
index 06c53d8..b17b48c 100644
--- a/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
+++ b/llvm/test/CodeGen/AArch64/sme-intrinsics-rdsvl.ll
@@ -86,4 +86,111 @@ define i64 @sme_cntsd_mul() {
ret i64 %res
}
-declare i64 @llvm.aarch64.sme.cntsd()
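+; Note: cntsd() returns the streaming vector length in doublewords (SVL bytes
+; divided by 8), so each multiply below should fold to a single rdsvl with a
+; scaled immediate plus at most one shift.
+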
+define i64 @sme_cntsb_mul_pos() {
+; CHECK-LABEL: sme_cntsb_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, 96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_pos() {
+; CHECK-LABEL: sme_cntsh_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, 3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_pos() {
+; CHECK-LABEL: sme_cntsw_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, 62
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_pos() {
+; CHECK-LABEL: sme_cntsd_mul_pos:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #31
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 992
+ ret i64 %res
+}
+
+define i64 @sme_cntsb_mul_neg() {
+; CHECK-LABEL: sme_cntsb_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-24
+; CHECK-NEXT: lsl x0, x8, #2
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 3
+ %res = mul nuw nsw i64 %shl, -96
+ ret i64 %res
+}
+
+define i64 @sme_cntsh_mul_neg() {
+; CHECK-LABEL: sme_cntsh_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #1
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 2
+ %res = mul nuw nsw i64 %shl, -3
+ ret i64 %res
+}
+
+define i64 @sme_cntsw_mul_neg() {
+; CHECK-LABEL: sme_cntsw_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-31
+; CHECK-NEXT: lsl x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %shl = shl nuw nsw i64 %v, 1
+ %res = mul nuw nsw i64 %shl, -992
+ ret i64 %res
+}
+
+define i64 @sme_cntsd_mul_neg() {
+; CHECK-LABEL: sme_cntsd_mul_neg:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #-3
+; CHECK-NEXT: lsr x0, x8, #3
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, -3
+ ret i64 %res
+}
+
+; Negative test: 993 cannot be expressed as an rdsvl immediate and shift, so
+; the multiply must remain.
+define i64 @sme_cntsd_mul_fail() {
+; CHECK-LABEL: sme_cntsd_mul_fail:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdsvl x8, #1
+; CHECK-NEXT: mov w9, #993 // =0x3e1
+; CHECK-NEXT: lsr x8, x8, #3
+; CHECK-NEXT: mul x0, x8, x9
+; CHECK-NEXT: ret
+ %v = call i64 @llvm.aarch64.sme.cntsd()
+ %res = mul nuw nsw i64 %v, 993
+ ret i64 %res
+}
+
diff --git a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
index b8d6c88..3f35cb5 100644
--- a/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
+++ b/llvm/test/CodeGen/AArch64/sme-za-exceptions.ll
@@ -829,7 +829,7 @@ define void @try_catch_agnostic_za_invoke() "aarch64_za_state_agnostic" personal
; CHECK-SDAG-NEXT: bl __arm_sme_restore
; CHECK-SDAG-NEXT: b .LBB5_1
entry:
- invoke void @agnostic_za_call()
+ invoke void @agnostic_za_call() "aarch64_za_state_agnostic"
to label %exit unwind label %catch
catch: