Diffstat (limited to 'llvm/test/CodeGen/AArch64')
-rw-r--r--  llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll | 31
-rw-r--r--  llvm/test/CodeGen/AArch64/addsub.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll | 6
-rw-r--r--  llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir | 2
-rw-r--r--  llvm/test/CodeGen/AArch64/framelayout-sve.mir | 24
-rw-r--r--  llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir | 101
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sme-streaming-interface.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/stack-probing-sve.ll | 4
-rw-r--r--  llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll | 255
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-alloca.ll | 16
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sve-tailcall.ll | 32
-rw-r--r--  llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll | 82
-rw-r--r--  llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll | 45
-rw-r--r--  llvm/test/CodeGen/AArch64/unwind-preserved.ll | 96
19 files changed, 346 insertions(+), 516 deletions(-)
diff --git a/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
new file mode 100644
index 0000000..bfe9ab8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/16bit-float-promotion-with-nofp.ll
@@ -0,0 +1,31 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -mtriple=aarch64 -mattr=-fp-armv8 -o - %s | FileCheck %s
+
+define half @f2h(float %a) {
+; CHECK-LABEL: f2h:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __gnu_f2h_ieee
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = fptrunc float %a to half
+ ret half %0
+}
+
+define bfloat @f2bfloat(float %a) {
+; CHECK-LABEL: f2bfloat:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: .cfi_offset w30, -16
+; CHECK-NEXT: bl __truncsfbf2
+; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+entry:
+ %0 = fptrunc float %a to bfloat
+ ret bfloat %0
+}
+
diff --git a/llvm/test/CodeGen/AArch64/addsub.ll b/llvm/test/CodeGen/AArch64/addsub.ll
index 1b86fe6..20215fe9 100644
--- a/llvm/test/CodeGen/AArch64/addsub.ll
+++ b/llvm/test/CodeGen/AArch64/addsub.ll
@@ -662,17 +662,13 @@ define dso_local i32 @_extract_crng_crng() {
; CHECK-NEXT: cmn x8, #1272
; CHECK-NEXT: b.pl .LBB36_3
; CHECK-NEXT: .LBB36_2: // %if.then
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
; CHECK-NEXT: adrp x8, primary_crng
; CHECK-NEXT: ldr w8, [x8, :lo12:primary_crng]
; CHECK-NEXT: cmp w8, #0
; CHECK-NEXT: adrp x8, input_pool
; CHECK-NEXT: add x8, x8, :lo12:input_pool
; CHECK-NEXT: csel x0, xzr, x8, eq
-; CHECK-NEXT: bl crng_reseed
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: b crng_reseed
; CHECK-NEXT: .LBB36_3: // %if.end
; CHECK-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
index 00ae34b..217f08b 100644
--- a/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
+++ b/llvm/test/CodeGen/AArch64/arm64ec-entry-thunks-local-linkage.ll
@@ -2,7 +2,8 @@
; Validates when local linkage functions get a thunk generated.
-; Being called does not cause a thunk to be generated.
+; Being called does not cause a thunk to be generated or the symbol name to be mangled.
+; CHECK-NOT: "#does_not_have_addr_taken":
; CHECK-NOT: $ientry_thunk$cdecl$v$f;
define internal void @does_not_have_addr_taken(float) nounwind {
ret void
@@ -12,7 +13,8 @@ define void @calls_does_not_have_addr_taken() nounwind {
ret void
}
-; Having an address taken does cause a thunk to be generated.
+; Having an address taken does cause a thunk to be generated and the symbol name to be mangled.
+; CHECK: "#has_addr_taken":
; CHECK: $ientry_thunk$cdecl$v$i8;
define internal void @has_addr_taken(i64) nounwind {
ret void
diff --git a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
index 94041bf..e601f03 100644
--- a/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
+++ b/llvm/test/CodeGen/AArch64/callbr-asm-obj-file.ll
@@ -40,7 +40,7 @@ declare dso_local i32 @g(...) local_unnamed_addr
declare dso_local i32 @i(...) local_unnamed_addr
; CHECK-LABEL: <test2>:
-; CHECK: bl {{.*}} <test2+0x18>
+; CHECK: b {{.*}} <test2+0x1c>
; CHECK-LABEL: <$d.5>:
; CHECK-LABEL: <$x.6>:
; CHECK-NEXT: b {{.*}} <test2+0x18>
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
index 3dba21d..aed3145 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve-calleesaves-fix.mir
@@ -19,8 +19,8 @@
; CHECK-NEXT: // implicit-def: $p4
; CHECK-NEXT: addvl sp, sp, #1
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
- ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
+ ; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 213d791..f7920e5 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -772,9 +772,9 @@ body: |
# CHECK: $sp = frame-destroy ADDXri $sp, 32, 0
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
+# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0
# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -873,14 +873,14 @@ body: |
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x98, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 1
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x20, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
-# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
-# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
-# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
# CHECK: $z22 = frame-destroy LDR_ZXI $sp, 3
# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
# CHECK: $z8 = frame-destroy LDR_ZXI $sp, 17
+# CHECK: $p15 = frame-destroy LDR_PXI $sp, 4
+# CHECK: $p14 = frame-destroy LDR_PXI $sp, 5
+# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
+# CHECK: $p4 = frame-destroy LDR_PXI $sp, 15
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 18
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 32
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
@@ -1037,14 +1037,14 @@ body: |
# CHECK-NEXT: $sp = frame-setup ANDXri killed $[[TMP]]
# CHECK: $sp = frame-destroy ADDVL_XXI $fp, -18
+# CHECK: $z23 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
+# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 4
# CHECK-NEXT: $p14 = frame-destroy LDR_PXI $sp, 5
# CHECK: $p5 = frame-destroy LDR_PXI $sp, 14
# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 15
-# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 2
-# CHECK-NEXT: $z22 = frame-destroy LDR_ZXI $sp, 3
-# CHECK: $z9 = frame-destroy LDR_ZXI $sp, 16
-# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 17
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
@@ -1198,10 +1198,10 @@ body: |
# CHECK: $sp = frame-destroy ADDVL_XXI $sp, 7
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x18, 0x92, 0x2e, 0x00, 0x1e, 0x22
-# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
-# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
# CHECK-NEXT: $z23 = frame-destroy LDR_ZXI $sp, 1
# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2
+# CHECK-NEXT: $p15 = frame-destroy LDR_PXI $sp, 6
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7
# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
diff --git a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir b/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
deleted file mode 100644
index de4baec..0000000
--- a/llvm/test/CodeGen/AArch64/reverse-csr-restore-seq.mir
+++ /dev/null
@@ -1,101 +0,0 @@
-# RUN: llc -run-pass=prologepilog -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
-# RUN: llc -start-before=prologepilog -stop-after=aarch64-ldst-opt -reverse-csr-restore-seq -o - -mtriple=aarch64-- %s | FileCheck %s --check-prefixes=CHECK
-#
---- |
-
- define void @foo() nounwind { entry: unreachable }
-
- define void @bar() nounwind { entry: unreachable }
-
- define void @baz() nounwind { entry: unreachable }
-
-...
----
-name: foo
-# CHECK-LABEL: name: foo
-tracksRegLiveness: true
-body: |
- bb.0:
- $x19 = IMPLICIT_DEF
- $x20 = IMPLICIT_DEF
- $x21 = IMPLICIT_DEF
- $x22 = IMPLICIT_DEF
- $x23 = IMPLICIT_DEF
- $x24 = IMPLICIT_DEF
- $x25 = IMPLICIT_DEF
- $x26 = IMPLICIT_DEF
-
- ; The local stack size is 0, so the last ldp in the sequence will also
- ; restore the stack.
- ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 2
- ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 4
- ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 6
-
- ; The ldp and the stack increment get merged even before
- ; the load-store optimizer.
- ; CHECK-NEXT: early-clobber $sp, $x26, $x25 = frame-destroy LDPXpost $sp, 8
-
- RET_ReallyLR
-...
----
-name: bar
-# CHECK-LABEL: name: bar
-tracksRegLiveness: true
-stack:
- - { id : 0, size: 8, alignment: 4,
- stack-id: default, callee-saved-register: '', callee-saved-restored: true,
- local-offset: -4, debug-info-variable: '', debug-info-expression: '',
- debug-info-location: '' }
-
-body: |
- bb.0:
- $x19 = IMPLICIT_DEF
- $x20 = IMPLICIT_DEF
- $x21 = IMPLICIT_DEF
- $x22 = IMPLICIT_DEF
- $x23 = IMPLICIT_DEF
- $x24 = IMPLICIT_DEF
- $x25 = IMPLICIT_DEF
- $x26 = IMPLICIT_DEF
-
- ; The local stack size is not 0, and we can combine the CSR stack size with
- ; the local stack size. This results in rewriting the offsets for all the
- ; save/restores and forbids us to merge the stack adjustment and the last pop.
- ; In this case, there is no point of moving the first CSR pair at the end.
- ; We do it anyway, as it's a small price to pay for the resulting
- ; simplification in the epilogue emission code.
- ; CHECK: $x24, $x23 = frame-destroy LDPXi $sp, 4
- ; CHECK-NEXT: $x22, $x21 = frame-destroy LDPXi $sp, 6
- ; CHECK-NEXT: $x20, $x19 = frame-destroy LDPXi $sp, 8
- ; CHECK-NEXT: $x26, $x25 = frame-destroy LDPXi $sp, 2
- ; CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 80, 0
- RET_ReallyLR
-...
----
-# Check that the load from the offset 0 is moved at the end even when hasFP is
-# false.
-name: baz
-# CHECK-LABEL: name: baz
-alignment: 4
-tracksRegLiveness: true
-frameInfo:
- adjustsStack: true
- hasCalls: true
-body: |
- bb.0:
- successors: %bb.1
-
- $x0 = IMPLICIT_DEF
- $x20 = IMPLICIT_DEF
- $x21 = IMPLICIT_DEF
-
- ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
- BL @foo, csr_aarch64_aapcs, implicit-def dead $lr, implicit $sp, implicit $x0, implicit-def $sp, implicit-def $x0
- ADJCALLSTACKUP 0, 0, implicit-def dead $sp, implicit $sp
- B %bb.1
-
- bb.1:
- ; CHECK: $x21, $x20 = frame-destroy LDPXi $sp, 2
- ; CHECK-NEXT: early-clobber $sp, $lr = frame-destroy LDRXpost $sp, 32
- RET_ReallyLR
-...
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
index 296f2be..6d2abf7 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-compatible-interface.ll
@@ -226,30 +226,30 @@ define <vscale x 2 x double> @streaming_compatible_with_scalable_vectors(<vscale
; CHECK-NEXT: ldr z1, [sp] // 16-byte Folded Reload
; CHECK-NEXT: fadd z0.d, z1.d, z0.d
; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -318,30 +318,30 @@ define <vscale x 2 x i1> @streaming_compatible_with_predicate_vectors(<vscale x
; CHECK-NEXT: ldr p1, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: and p0.b, p1/z, p1.b, p0.b
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
index 86918a59..de676ac 100644
--- a/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
+++ b/llvm/test/CodeGen/AArch64/sme-streaming-interface.ll
@@ -187,30 +187,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve(<vscale x 4 x i32> %x) nounwind {
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -267,30 +267,30 @@ define <vscale x 4 x i32> @smstart_clobber_sve_duplicate(<vscale x 4 x i32> %x)
; CHECK-NEXT: smstop sm
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #1
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
index b7119fc..ea7808d 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ld1.ll
@@ -129,7 +129,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -145,6 +144,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale x
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -284,7 +284,6 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -300,6 +299,7 @@ define <vscale x 32 x i8> @ld1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -440,7 +440,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -456,6 +455,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -595,7 +595,6 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -611,6 +610,7 @@ define <vscale x 16 x i16> @ld1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -751,7 +751,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -767,6 +766,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -906,7 +906,6 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -922,6 +921,7 @@ define <vscale x 8 x i32> @ld1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1062,7 +1062,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1078,6 +1077,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1217,7 +1217,6 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1233,6 +1232,7 @@ define <vscale x 4 x i64> @ld1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1380,7 +1380,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1395,6 +1394,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused, <v
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1545,7 +1545,6 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1560,6 +1559,7 @@ define <vscale x 64 x i8> @ld1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %unu
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1711,7 +1711,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1726,6 +1725,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1877,7 +1877,6 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1892,6 +1891,7 @@ define <vscale x 32 x i16> @ld1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2043,7 +2043,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2058,6 +2057,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2209,7 +2209,6 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2224,6 +2223,7 @@ define <vscale x 16 x i32> @ld1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2375,7 +2375,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2390,6 +2389,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused, <
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -2541,7 +2541,6 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -2556,6 +2555,7 @@ define <vscale x 8 x i64> @ld1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %un
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
index 1fb251a..7e2d28f 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-ldnt1.ll
@@ -82,7 +82,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -98,6 +97,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8(<vscale x 16 x i8> %unused, <vscale
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -190,7 +190,6 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -206,6 +205,7 @@ define <vscale x 32 x i8> @ldnt1_x2_i8_z0_z8_scalar(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -299,7 +299,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -315,6 +314,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8(<vscale x 8 x i16> %unused, <vsca
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -407,7 +407,6 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -423,6 +422,7 @@ define <vscale x 16 x i16> @ldnt1_x2_i16_z0_z8_scalar(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -516,7 +516,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -532,6 +531,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8(<vscale x 4 x i32> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -624,7 +624,6 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -640,6 +639,7 @@ define <vscale x 8 x i32> @ldnt1_x2_i32_z0_z8_scalar(<vscale x 4 x i32> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -733,7 +733,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -749,6 +748,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8(<vscale x 2 x i64> %unused, <vscal
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -841,7 +841,6 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z0, [sp]
; CONTIGUOUS-NEXT: ldr z1, [sp, #1, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #2
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -857,6 +856,7 @@ define <vscale x 4 x i64> @ldnt1_x2_i64_z0_z8_scalar(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #16
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -955,7 +955,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -970,6 +969,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12(<vscale x 16 x i8> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1071,7 +1071,6 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1086,6 +1085,7 @@ define <vscale x 64 x i8> @ldnt1_x4_i8_z0_z4_z8_z12_scalar(<vscale x 16 x i8> %u
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1188,7 +1188,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1203,6 +1202,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12(<vscale x 8 x i16> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1304,7 +1304,6 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1319,6 +1318,7 @@ define <vscale x 32 x i16> @ldnt1_x4_i16_z0_z4_z8_z12_scalar(<vscale x 8 x i16>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1421,7 +1421,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1436,6 +1435,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12(<vscale x 4 x i32> %unused
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1537,7 +1537,6 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1552,6 +1551,7 @@ define <vscale x 16 x i32> @ldnt1_x4_i32_z0_z4_z8_z12_scalar(<vscale x 4 x i32>
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1654,7 +1654,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1669,6 +1668,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12(<vscale x 2 x i64> %unused,
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
@@ -1770,7 +1770,6 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z2, [sp, #2, mul vl]
; CONTIGUOUS-NEXT: ldr z3, [sp, #3, mul vl]
; CONTIGUOUS-NEXT: addvl sp, sp, #4
-; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -1785,6 +1784,7 @@ define <vscale x 8 x i64> @ldnt1_x4_i64_z0_z4_z8_z12_scalar(<vscale x 2 x i64> %
; CONTIGUOUS-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
; CONTIGUOUS-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CONTIGUOUS-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CONTIGUOUS-NEXT: addvl sp, sp, #15
; CONTIGUOUS-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
; CONTIGUOUS-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
index 1ad7870..56d865e 100644
--- a/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
+++ b/llvm/test/CodeGen/AArch64/stack-probing-sve.ll
@@ -380,7 +380,6 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x40, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d15 @ cfa - 16 - 64 * VG
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
-; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #3, mul vl] // 16-byte Folded Reload
@@ -397,6 +396,7 @@ define void @sve_16v_1p_csr(<vscale x 4 x float> %a) #0 {
; CHECK-NEXT: ldr z10, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #17
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -697,10 +697,10 @@ define void @sve_unprobed_area(<vscale x 4 x float> %a, i32 %n) #0 {
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x20, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 32 * VG
-; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #4
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
diff --git a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
index 37186cf..9fa5208 100644
--- a/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
+++ b/llvm/test/CodeGen/AArch64/strictfp_f16_abi_promote.ll
@@ -70,22 +70,20 @@ define void @v3f16_arg(<3 x half> %arg, ptr %ptr) #0 {
; NOFP16-NEXT: .cfi_offset w22, -32
; NOFP16-NEXT: .cfi_offset w30, -48
; NOFP16-NEXT: mov w21, w0
-; NOFP16-NEXT: and w0, w2, #0xffff
+; NOFP16-NEXT: and w0, w1, #0xffff
; NOFP16-NEXT: mov x19, x3
-; NOFP16-NEXT: mov w20, w1
+; NOFP16-NEXT: mov w20, w2
; NOFP16-NEXT: bl __gnu_h2f_ieee
; NOFP16-NEXT: mov w22, w0
; NOFP16-NEXT: and w0, w21, #0xffff
; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w21, w0
+; NOFP16-NEXT: mov w8, w0
; NOFP16-NEXT: and w0, w20, #0xffff
+; NOFP16-NEXT: orr x21, x8, x22, lsl #32
; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w8, w21
-; NOFP16-NEXT: // kill: def $w0 killed $w0 def $x0
-; NOFP16-NEXT: str w22, [x19, #8]
-; NOFP16-NEXT: orr x8, x8, x0, lsl #32
+; NOFP16-NEXT: str x21, [x19]
; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT: str x8, [x19]
+; NOFP16-NEXT: str w0, [x19, #8]
; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
; NOFP16-NEXT: ret
@@ -133,26 +131,107 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
ret void
}
-; FIXME:
-; define half @f16_return(float %arg) #0 {
-; %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret half %fptrunc
-; }
+ define half @f16_return(float %arg) #0 {
+; NOFP16-LABEL: f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 16
+; NOFP16-NEXT: .cfi_offset w30, -16
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call half @llvm.experimental.constrained.fptrunc.f16.f32(float %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret half %fptrunc
+ }
-; define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
-; %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret <2 x half> %fptrunc
-; }
+ define <2 x half> @v2f16_return(<2 x float> %arg) #0 {
+; NOFP16-LABEL: v2f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: str x30, [sp, #-32]! // 8-byte Folded Spill
+; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 32
+; NOFP16-NEXT: .cfi_offset w19, -8
+; NOFP16-NEXT: .cfi_offset w20, -16
+; NOFP16-NEXT: .cfi_offset w30, -32
+; NOFP16-NEXT: mov w19, w0
+; NOFP16-NEXT: mov w0, w1
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w20, w0
+; NOFP16-NEXT: mov w0, w19
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w1, w20
+; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT: ldr x30, [sp], #32 // 8-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call <2 x half> @llvm.experimental.constrained.fptrunc.v2f16.v2f32(<2 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret <2 x half> %fptrunc
+ }
-; define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
-; %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret <3 x half> %fptrunc
-; }
+ define <3 x half> @v3f16_return(<3 x float> %arg) #0 {
+; NOFP16-LABEL: v3f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: stp x30, x21, [sp, #-32]! // 16-byte Folded Spill
+; NOFP16-NEXT: stp x20, x19, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 32
+; NOFP16-NEXT: .cfi_offset w19, -8
+; NOFP16-NEXT: .cfi_offset w20, -16
+; NOFP16-NEXT: .cfi_offset w21, -24
+; NOFP16-NEXT: .cfi_offset w30, -32
+; NOFP16-NEXT: mov w20, w0
+; NOFP16-NEXT: mov w0, w2
+; NOFP16-NEXT: mov w19, w1
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w21, w0
+; NOFP16-NEXT: mov w0, w19
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w19, w0
+; NOFP16-NEXT: mov w0, w20
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w1, w19
+; NOFP16-NEXT: ldp x20, x19, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT: mov w2, w21
+; NOFP16-NEXT: ldp x30, x21, [sp], #32 // 16-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call <3 x half> @llvm.experimental.constrained.fptrunc.v3f16.v3f32(<3 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret <3 x half> %fptrunc
+ }
-; define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
-; %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
-; ret <4 x half> %fptrunc
-; }
+ define <4 x half> @v4f16_return(<4 x float> %arg) #0 {
+; NOFP16-LABEL: v4f16_return:
+; NOFP16: // %bb.0:
+; NOFP16-NEXT: str x30, [sp, #-48]! // 8-byte Folded Spill
+; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
+; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 48
+; NOFP16-NEXT: .cfi_offset w19, -8
+; NOFP16-NEXT: .cfi_offset w20, -16
+; NOFP16-NEXT: .cfi_offset w21, -24
+; NOFP16-NEXT: .cfi_offset w22, -32
+; NOFP16-NEXT: .cfi_offset w30, -48
+; NOFP16-NEXT: mov w21, w0
+; NOFP16-NEXT: mov w0, w3
+; NOFP16-NEXT: mov w19, w2
+; NOFP16-NEXT: mov w20, w1
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w22, w0
+; NOFP16-NEXT: mov w0, w19
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w19, w0
+; NOFP16-NEXT: mov w0, w20
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w20, w0
+; NOFP16-NEXT: mov w0, w21
+; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: mov w1, w20
+; NOFP16-NEXT: mov w2, w19
+; NOFP16-NEXT: mov w3, w22
+; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
+; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
+; NOFP16-NEXT: ldr x30, [sp], #48 // 8-byte Folded Reload
+; NOFP16-NEXT: ret
+ %fptrunc = call <4 x half> @llvm.experimental.constrained.fptrunc.v4f16.v4f32(<4 x float> %arg, metadata !"round.tonearest", metadata !"fpexcept.strict")
+ ret <4 x half> %fptrunc
+ }
; FIXME:
; define void @outgoing_f16_arg(ptr %ptr) #0 {
@@ -182,46 +261,17 @@ define void @v4f16_arg(<4 x half> %arg, ptr %ptr) #0 {
define void @outgoing_v4f16_return(ptr %ptr) #0 {
; NOFP16-LABEL: outgoing_v4f16_return:
; NOFP16: // %bb.0:
-; NOFP16-NEXT: stp x30, x23, [sp, #-48]! // 16-byte Folded Spill
-; NOFP16-NEXT: stp x22, x21, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x20, x19, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT: .cfi_def_cfa_offset 48
+; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 16
; NOFP16-NEXT: .cfi_offset w19, -8
-; NOFP16-NEXT: .cfi_offset w20, -16
-; NOFP16-NEXT: .cfi_offset w21, -24
-; NOFP16-NEXT: .cfi_offset w22, -32
-; NOFP16-NEXT: .cfi_offset w23, -40
-; NOFP16-NEXT: .cfi_offset w30, -48
+; NOFP16-NEXT: .cfi_offset w30, -16
; NOFP16-NEXT: mov x19, x0
; NOFP16-NEXT: bl v4f16_result
-; NOFP16-NEXT: and w0, w0, #0xffff
-; NOFP16-NEXT: mov w20, w1
-; NOFP16-NEXT: mov w21, w2
-; NOFP16-NEXT: mov w22, w3
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w23, w0
-; NOFP16-NEXT: and w0, w20, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w20, w0
-; NOFP16-NEXT: and w0, w21, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w21, w0
-; NOFP16-NEXT: and w0, w22, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #6]
-; NOFP16-NEXT: mov w0, w21
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #4]
-; NOFP16-NEXT: mov w0, w20
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #2]
-; NOFP16-NEXT: mov w0, w23
-; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: strh w2, [x19, #4]
+; NOFP16-NEXT: strh w3, [x19, #6]
+; NOFP16-NEXT: strh w1, [x19, #2]
; NOFP16-NEXT: strh w0, [x19]
-; NOFP16-NEXT: ldp x20, x19, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x22, x21, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x30, x23, [sp], #48 // 16-byte Folded Reload
+; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; NOFP16-NEXT: ret
%val = call <4 x half> @v4f16_result()
store <4 x half> %val, ptr %ptr
@@ -231,82 +281,21 @@ define void @outgoing_v4f16_return(ptr %ptr) #0 {
define void @outgoing_v8f16_return(ptr %ptr) #0 {
; NOFP16-LABEL: outgoing_v8f16_return:
; NOFP16: // %bb.0:
-; NOFP16-NEXT: stp x30, x27, [sp, #-80]! // 16-byte Folded Spill
-; NOFP16-NEXT: stp x26, x25, [sp, #16] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x24, x23, [sp, #32] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x22, x21, [sp, #48] // 16-byte Folded Spill
-; NOFP16-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; NOFP16-NEXT: .cfi_def_cfa_offset 80
+; NOFP16-NEXT: stp x30, x19, [sp, #-16]! // 16-byte Folded Spill
+; NOFP16-NEXT: .cfi_def_cfa_offset 16
; NOFP16-NEXT: .cfi_offset w19, -8
-; NOFP16-NEXT: .cfi_offset w20, -16
-; NOFP16-NEXT: .cfi_offset w21, -24
-; NOFP16-NEXT: .cfi_offset w22, -32
-; NOFP16-NEXT: .cfi_offset w23, -40
-; NOFP16-NEXT: .cfi_offset w24, -48
-; NOFP16-NEXT: .cfi_offset w25, -56
-; NOFP16-NEXT: .cfi_offset w26, -64
-; NOFP16-NEXT: .cfi_offset w27, -72
-; NOFP16-NEXT: .cfi_offset w30, -80
+; NOFP16-NEXT: .cfi_offset w30, -16
; NOFP16-NEXT: mov x19, x0
; NOFP16-NEXT: bl v8f16_result
-; NOFP16-NEXT: and w0, w0, #0xffff
-; NOFP16-NEXT: mov w21, w1
-; NOFP16-NEXT: mov w22, w2
-; NOFP16-NEXT: mov w23, w3
-; NOFP16-NEXT: mov w24, w4
-; NOFP16-NEXT: mov w25, w5
-; NOFP16-NEXT: mov w26, w6
-; NOFP16-NEXT: mov w27, w7
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w20, w0
-; NOFP16-NEXT: and w0, w21, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w21, w0
-; NOFP16-NEXT: and w0, w22, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w22, w0
-; NOFP16-NEXT: and w0, w23, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w23, w0
-; NOFP16-NEXT: and w0, w24, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w24, w0
-; NOFP16-NEXT: and w0, w25, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w25, w0
-; NOFP16-NEXT: and w0, w26, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: mov w26, w0
-; NOFP16-NEXT: and w0, w27, #0xffff
-; NOFP16-NEXT: bl __gnu_h2f_ieee
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #14]
-; NOFP16-NEXT: mov w0, w26
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #12]
-; NOFP16-NEXT: mov w0, w25
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #10]
-; NOFP16-NEXT: mov w0, w24
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #8]
-; NOFP16-NEXT: mov w0, w23
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #6]
-; NOFP16-NEXT: mov w0, w22
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #4]
-; NOFP16-NEXT: mov w0, w21
-; NOFP16-NEXT: bl __gnu_f2h_ieee
-; NOFP16-NEXT: strh w0, [x19, #2]
-; NOFP16-NEXT: mov w0, w20
-; NOFP16-NEXT: bl __gnu_f2h_ieee
+; NOFP16-NEXT: strh w5, [x19, #10]
+; NOFP16-NEXT: strh w7, [x19, #14]
+; NOFP16-NEXT: strh w6, [x19, #12]
+; NOFP16-NEXT: strh w4, [x19, #8]
+; NOFP16-NEXT: strh w3, [x19, #6]
+; NOFP16-NEXT: strh w2, [x19, #4]
+; NOFP16-NEXT: strh w1, [x19, #2]
; NOFP16-NEXT: strh w0, [x19]
-; NOFP16-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x22, x21, [sp, #48] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x24, x23, [sp, #32] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x26, x25, [sp, #16] // 16-byte Folded Reload
-; NOFP16-NEXT: ldp x30, x27, [sp], #80 // 16-byte Folded Reload
+; NOFP16-NEXT: ldp x30, x19, [sp], #16 // 16-byte Folded Reload
; NOFP16-NEXT: ret
%val = call <8 x half> @v8f16_result()
store <8 x half> %val, ptr %ptr
diff --git a/llvm/test/CodeGen/AArch64/sve-alloca.ll b/llvm/test/CodeGen/AArch64/sve-alloca.ll
index 47e49b8..d227538 100644
--- a/llvm/test/CodeGen/AArch64/sve-alloca.ll
+++ b/llvm/test/CodeGen/AArch64/sve-alloca.ll
@@ -66,30 +66,30 @@ define void @foo(<vscale x 4 x i64> %dst, i1 %cond) {
; CHECK-NEXT: st1d { z0.d }, p0, [x0]
; CHECK-NEXT: bl bar
; CHECK-NEXT: addvl sp, x29, #-18
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
index 9851583..3965af6 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-mixed.ll
@@ -567,30 +567,30 @@ define <vscale x 4 x float> @sve_caller_non_sve_callee_high_range(<vscale x 4 x
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #3
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -659,30 +659,30 @@ define <vscale x 4 x float> @sve_ret_caller_non_sve_callee_high_range() {
; CHECK-NEXT: fmov s7, #7.00000000
; CHECK-NEXT: bl non_sve_callee_high_range
; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve-tailcall.ll b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
index f32c80d..4ddf007 100644
--- a/llvm/test/CodeGen/AArch64/sve-tailcall.ll
+++ b/llvm/test/CodeGen/AArch64/sve-tailcall.ll
@@ -83,30 +83,30 @@ define i32 @sve_caller_non_sve_callee(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
@@ -158,30 +158,30 @@ define i32 @sve_caller_non_sve_callee_fastcc(<vscale x 4 x i32> %arg) nounwind {
; CHECK-NEXT: //APP
; CHECK-NEXT: //NO_APP
; CHECK-NEXT: bl non_sve_callee
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
index 7cae1d2..a592dcd 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-pred.ll
@@ -4,12 +4,7 @@
define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov p0.b, z0
; CHECK-NEXT: ret
entry:
%res = call <vscale x 16 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv16i8(<vscale x 16 x i8> %zn, i32 0)
@@ -19,27 +14,10 @@ define <vscale x 16 x i1> @test_pmov_to_pred_i8(<vscale x 16 x i8> %zn) {
define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16
-; CHECK-NEXT: ptrue p1.h
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: pmov p1.h, z0[0]
+; CHECK-NEXT: pmov p2.h, z0[1]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 8 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv8i16(<vscale x 8 x i16> %zn, i32 0)
@@ -52,27 +30,10 @@ define <vscale x 8 x i1> @test_pmov_to_pred_i16(<vscale x 8 x i16> %zn) {
define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32
-; CHECK-NEXT: ptrue p1.s
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: pmov p1.s, z0[0]
+; CHECK-NEXT: pmov p2.s, z0[3]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 4 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv4i32(<vscale x 4 x i32> %zn, i32 0)
@@ -85,27 +46,10 @@ define <vscale x 4 x i1> @test_pmov_to_pred_i32(<vscale x 4 x i32> %zn) {
define <vscale x 2 x i1> @test_pmov_to_pred_i64(<vscale x 2 x i64> %zn) {
; CHECK-LABEL: test_pmov_to_pred_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK-NEXT: addvl sp, sp, #-2
-; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK-NEXT: str z8, [sp, #1, mul vl] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_escape 0x0f, 0x0c, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x10, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 16 * VG
-; CHECK-NEXT: .cfi_offset w30, -8
-; CHECK-NEXT: .cfi_offset w29, -16
-; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0a, 0x11, 0x70, 0x22, 0x11, 0x78, 0x92, 0x2e, 0x00, 0x1e, 0x22 // $d8 @ cfa - 16 - 8 * VG
-; CHECK-NEXT: mov w0, wzr
-; CHECK-NEXT: mov z8.d, z0.d
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: mov z0.d, z8.d
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: mov p4.b, p0.b
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64
-; CHECK-NEXT: ptrue p1.d
-; CHECK-NEXT: ldr z8, [sp, #1, mul vl] // 16-byte Folded Reload
-; CHECK-NEXT: eor p0.b, p1/z, p4.b, p0.b
-; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: addvl sp, sp, #2
-; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: pmov p1.d, z0[0]
+; CHECK-NEXT: pmov p2.d, z0[7]
+; CHECK-NEXT: eor p0.b, p0/z, p1.b, p2.b
; CHECK-NEXT: ret
entry:
%res1 = call <vscale x 2 x i1> @llvm.aarch64.sve.pmov.to.pred.lane.nxv2i64(<vscale x 2 x i64> %zn, i32 0)
diff --git a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
index 58b240b..b7f36c6 100644
--- a/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
+++ b/llvm/test/CodeGen/AArch64/sve2p1-intrinsics-pmov-to-vector.ll
@@ -6,12 +6,7 @@
define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #1 // =0x1
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[1], p0.h
; CHECK-NEXT: ret
entry:
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv8i16(<vscale x 8 x i16> %zn, <vscale x 8 x i1> %pn, i32 1)
@@ -21,12 +16,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_i16(<vscale x 8 x i16> %zn, <vsca
define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #3 // =0x3
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[3], p0.s
; CHECK-NEXT: ret
entry:
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv4i32(<vscale x 4 x i32> %zn, <vscale x 4 x i1> %pn, i32 3)
@@ -36,12 +26,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_i32(<vscale x 4 x i32> %zn, <vsca
define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: mov w0, #7 // =0x7
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[7], p0.d
; CHECK-NEXT: ret
entry:
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.merging.nxv2i64(<vscale x 2 x i64> %zn, <vscale x 2 x i1> %pn, i32 7)
@@ -54,11 +39,7 @@ define <vscale x 2 x i64> @test_pmov_to_vector_i64(<vscale x 2 x i64> %zn, <vsca
define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i8:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0, p0.b
; CHECK-NEXT: ret
entry:
%res = call <vscale x 16 x i8> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv16i8(<vscale x 16 x i1> %pn)
@@ -68,11 +49,7 @@ define <vscale x 16 x i8> @test_pmov_to_vector_zero_i8(<vscale x 16 x i1> %pn) {
define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i16:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.h
; CHECK-NEXT: ret
entry:
%res = call <vscale x 8 x i16> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv8i16(<vscale x 8 x i1> %pn)
@@ -82,11 +59,7 @@ define <vscale x 8 x i16> @test_pmov_to_vector_zero_i16(<vscale x 8 x i1> %pn) {
define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i32:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.s
; CHECK-NEXT: ret
entry:
%res = call <vscale x 4 x i32> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv4i32(<vscale x 4 x i1> %pn)
@@ -96,11 +69,7 @@ define <vscale x 4 x i32> @test_pmov_to_vector_zero_i32(<vscale x 4 x i1> %pn) {
define <vscale x 2 x i64> @test_pmov_to_vector_zero_i64(<vscale x 2 x i1> %pn) {
; CHECK-LABEL: test_pmov_to_vector_zero_i64:
; CHECK: // %bb.0: // %entry
-; CHECK-NEXT: str x30, [sp, #-16]! // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 16
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: bl llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64
-; CHECK-NEXT: ldr x30, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: pmov z0[0], p0.d
; CHECK-NEXT: ret
entry:
%res = call <vscale x 2 x i64> @llvm.aarch64.sve.pmov.to.vector.lane.zeroing.nxv2i64(<vscale x 2 x i1> %pn)
diff --git a/llvm/test/CodeGen/AArch64/unwind-preserved.ll b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
index f3c4d217..822be14 100644
--- a/llvm/test/CodeGen/AArch64/unwind-preserved.ll
+++ b/llvm/test/CodeGen/AArch64/unwind-preserved.ll
@@ -63,18 +63,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -91,6 +79,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -112,18 +112,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #2
; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -140,6 +128,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; CHECK-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; CHECK-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; CHECK-NEXT: addvl sp, sp, #18
; CHECK-NEXT: .cfi_def_cfa wsp, 16
; CHECK-NEXT: .cfi_restore z8
@@ -215,18 +215,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp, #1, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -243,6 +231,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8
@@ -264,18 +264,6 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z0, [sp] // 16-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #2
; GISEL-NEXT: .cfi_escape 0x0f, 0x0d, 0x8f, 0x00, 0x11, 0x10, 0x22, 0x11, 0x90, 0x01, 0x92, 0x2e, 0x00, 0x1e, 0x22 // sp + 16 + 144 * VG
-; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
@@ -292,6 +280,18 @@ define <vscale x 4 x i32> @invoke_callee_may_throw_sve(<vscale x 4 x i32> %v) uw
; GISEL-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
; GISEL-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; GISEL-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; GISEL-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
; GISEL-NEXT: addvl sp, sp, #18
; GISEL-NEXT: .cfi_def_cfa wsp, 16
; GISEL-NEXT: .cfi_restore z8