aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AArch64
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen/AArch64')
-rw-r--r--llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir8
-rw-r--r--llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/framelayout-split-sve.mir587
-rw-r--r--llvm/test/CodeGen/AArch64/framelayout-sve.mir12
-rw-r--r--llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir16
-rw-r--r--llvm/test/CodeGen/AArch64/spillfill-sve.mir10
-rw-r--r--llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll824
-rw-r--r--llvm/test/CodeGen/AArch64/stack-hazard.ll876
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll10
-rw-r--r--llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll2854
-rw-r--r--llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll2
11 files changed, 4837 insertions, 366 deletions
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
index aca2816..7fd0cee 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
@@ -164,10 +164,10 @@ stack:
- { id: 1, name: z1.addr, size: 16, alignment: 16, stack-id: scalable-vector,
debug-info-variable: '!31', debug-info-expression: '!DIExpression()',
debug-info-location: '!32' }
- - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!33', debug-info-expression: '!DIExpression()',
debug-info-location: '!34' }
- - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!35', debug-info-expression: '!DIExpression()',
debug-info-location: '!36' }
- { id: 4, name: w0.addr, size: 4, alignment: 4, local-offset: -4, debug-info-variable: '!37',
@@ -181,10 +181,10 @@ stack:
- { id: 7, name: localv1, size: 16, alignment: 16, stack-id: scalable-vector,
debug-info-variable: '!45', debug-info-expression: '!DIExpression()',
debug-info-location: '!46' }
- - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!48', debug-info-expression: '!DIExpression()',
debug-info-location: '!49' }
- - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!51', debug-info-expression: '!DIExpression()',
debug-info-location: '!52' }
machineFunctionInfo: {}
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
index 0ea180b..41ba554 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
@@ -96,8 +96,8 @@ stack:
- { id: 1, size: 8, alignment: 8 }
- { id: 2, size: 16, alignment: 16, stack-id: scalable-vector }
- { id: 3, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 4, size: 2, alignment: 2, stack-id: scalable-vector }
- - { id: 5, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 4, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
+ - { id: 5, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
machineFunctionInfo: {}
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
new file mode 100644
index 0000000..35eafe8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
@@ -0,0 +1,587 @@
+# RUN: llc -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -filetype=obj -o %t
+# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO
+# RUN: rm -rf %t
+#
+# Test allocation and deallocation of SVE objects on the stack with
+# split-sve-objects (and hazard padding) enabled. This also tests using a
+# combination of scalable and non-scalable offsets to access the SVE on the
+# stack.
+#
+# With split-sve-objects (which implies hazard padding) the SVE area is split
+# into PPR and ZPR areas with (fixed-size) hazard padding between them. The PPR
+# area holds all scalable predicate callee saves and locals, and the ZPR area
+# holds all scalable vector callee saves and locals. Additionally, any FPR
+# callee save is promoted to a ZPR callee save (to avoid needing additional
+# hazard padding in the callee save area).
+#
+# +-------------+
+# | stack arg |
+# +-------------+ <- SP before call
+# | Callee Saves|
+# | Frame record| (if available)
+# |-------------| <- FP (if available)
+# | PPR area |
+# |-------------|
+# |/////////////| hazard padding
+# |-------------|
+# | ZPR area |
+# +-------------+
+# | : |
+# | Stack objs |
+# | : |
+# +-------------+ <- SP after call and frame-setup
+#
+--- |
+
+ define void @test_allocate_split_sve() uwtable { entry: unreachable }
+ define void @test_allocate_split_sve_realigned() uwtable { entry: unreachable }
+ define void @test_address_split_sve() uwtable { entry: unreachable }
+ define void @test_address_split_sve_fp() uwtable { entry: unreachable }
+ define aarch64_sve_vector_pcs void @save_restore_ppr_zpr() uwtable { entry: unreachable }
+
+...
+---
+# +----------+
+# |scratchreg| // x29 is used as scratch reg.
+# |----------|
+# | %stack.0 | // scalable predicate of n * 12 bytes, aligned to 16 bytes
+# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes)
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# +----------+
+# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.1 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_allocate_split_sve
+# CHECK: stackSize: 1056
+
+# CHECK: bb.0.entry:
+# CHECK: liveins: $z0, $p0, $fp
+# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+#
+# CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0
+# CHECK-NEXT: $x8 = ADDPL_XXI $x8, 7, implicit $vg
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0
+# CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1)
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_allocate_split_sve:
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 1056
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_allocate_split_sve
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 }
+ - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 }
+ - { id: 2, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $p0
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1)
+ RET_ReallyLR
+...
+---
+
+# Stack realignment is not supported with split-sve-objects, so we fallback to
+# the default hazard padding implementation. This does not prevent hazards
+# between ZPRs and PPRs (TODO: support this case).
+#
+# +----------+
+# | lr, fp | // frame record
+# |----------|
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.0 | // scalable predicate of n * 12 bytes, aligned to 16 bytes
+# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes)
+# +----------+
+# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.1 | // not scalable
+# +----------+ <- SP
+
+name: test_allocate_split_sve_realigned
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 }
+ - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 }
+ - { id: 2, stack-id: default, size: 16, alignment: 32 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $p0
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1)
+ RET_ReallyLR
+
+# CHECK-LABEL: name: test_allocate_split_sve_realigned
+# CHECK: stackSize: 2080
+
+# CHECK: bb.0.entry:
+# CHECK: liveins: $z0, $p0, $lr
+# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4)
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg
+# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930
+#
+# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
+# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1)
+#
+# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040
+# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4)
+# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5)
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_allocate_split_sve_realigned
+# ASM: sub sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: str x29, [sp, #1024]
+# ASM-NEXT: str x30, [sp, #1032]
+# ASM-NEXT: add x29, sp, #1024
+# ASM-NEXT: .cfi_def_cfa w29, 16
+# ASM-NEXT: .cfi_offset w30, -8
+# ASM-NEXT: .cfi_offset w29, -16
+#
+# ASM: sub sp, x29, #1024
+# ASM-NEXT: .cfi_def_cfa wsp, 1040
+# ASM-NEXT: ldr x30, [sp, #1032]
+# ASM-NEXT: ldr x29, [sp, #1024]
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w30
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+#
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg30
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+...
+---
+
+# +----------+
+# |scratchreg| // x29 is used as scratch reg.
+# +----------+
+# | %stack.2 | // scalable predicate @ SP + 2064b + 46 scalable bytes
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# |----------|
+# | %stack.0 | // scalable vector @ SP + 1040b + 16 scalable bytes
+# | %stack.1 | // scalable vector @ SP + 1040b
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.3 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_address_split_sve
+# CHECK: stackSize: 1056
+
+# CHECK: bb.0.entry:
+# CHECK-NEXT: liveins:
+# CHECK-NEXT: {{ $}}
+# CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+#
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0
+# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 1
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0
+# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], 0
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0
+# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_address_split_sve
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 1056
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_address_split_sve
+frameInfo:
+ maxAlignment: 16
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 }
+ - { id: 3, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $z1, $p0
+
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1)
+ STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2)
+
+ RET_ReallyLR
+...
+---
+# +----------+
+# | lr, fp | // frame record
+# +----------+ <- FP
+# | %stack.2 | // scalable predicate @ FP - 2 scalable bytes
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# |----------|
+# | %stack.0 | // scalable vector @ FP - 1024b - 32 scalable bytes
+# | %stack.1 | // scalable vector @ FP - 1024b - 48 scalable bytes
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.3 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_address_split_sve_fp
+# CHECK: stackSize: 1056
+#
+# CHECK: bb.0.entry:
+# CHECK-NEXT: liveins:
+# CHECK-NEXT: {{ $}}
+# CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.6), (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+#
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3
+# CHECK-NEXT: STR_PXI $p0, $fp, -1
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_address_split_sve_fp
+# ASM: stp x29, x30, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: mov x29, sp
+# ASM-NEXT: .cfi_def_cfa w29, 16
+# ASM-NEXT: .cfi_offset w30, -8
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: addvl sp, sp, #-2
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldp x29, x30, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w30
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+#
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg30
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_address_split_sve_fp
+frameInfo:
+ maxAlignment: 16
+ isFrameAddressTaken: true
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 }
+ - { id: 3, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $z1, $p0
+
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1)
+ STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2)
+
+ RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: save_restore_ppr_zpr
+# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.8)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: frame-setup STR_PXI killed $p6, $sp, 5 :: (store (s16) into %stack.7)
+# CHECK-NEXT: frame-setup STR_PXI killed $p5, $sp, 6 :: (store (s16) into %stack.6)
+# CHECK-NEXT: frame-setup STR_PXI killed $p4, $sp, 7 :: (store (s16) into %stack.5)
+#
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+#
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 :: (store (s128) into %stack.4)
+# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 :: (store (s128) into %stack.3)
+# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 :: (store (s128) into %stack.2)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1056, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+#
+#
+# CHECK: $sp = frame-destroy ADDXri $sp, 1056, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4)
+# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3)
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.2)
+#
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
+# CHECK-NEXT: $p6 = frame-destroy LDR_PXI $sp, 5 :: (load (s16) from %stack.7)
+# CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 6 :: (load (s16) from %stack.6)
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 :: (load (s16) from %stack.5)
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.8)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: save_restore_ppr_zpr:
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+# ASM-NEXT: str p6, [sp, #5, mul vl]
+# ASM-NEXT: str p5, [sp, #6, mul vl]
+# ASM-NEXT: str p4, [sp, #7, mul vl]
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-3
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG
+# ASM-NEXT: str z10, [sp]
+# ASM-NEXT: str z9, [sp, #1, mul vl]
+# ASM-NEXT: str z8, [sp, #2, mul vl]
+# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1040
+# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1040
+# ASM-NEXT: sub sp, sp, #1056
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 2096 + 32 * VG
+#
+# ASM: add sp, sp, #1056
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG
+# ASM-NEXT: ldr z10, [sp]
+# ASM-NEXT: ldr z9, [sp, #1, mul vl]
+# ASM-NEXT: ldr z8, [sp, #2, mul vl]
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 16 + 32 * VG
+# ASM-NEXT: addvl sp, sp, #3
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+# ASM-NEXT: .cfi_restore z8
+# ASM-NEXT: .cfi_restore z9
+# ASM-NEXT: .cfi_restore z10
+# ASM-NEXT: ldr p6, [sp, #5, mul vl]
+# ASM-NEXT: ldr p5, [sp, #6, mul vl]
+# ASM-NEXT: ldr p4, [sp, #7, mul vl]
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg72 DW_OP_bregx 0x2e +0, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg73 DW_OP_bregx 0x2e +0, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg74 DW_OP_bregx 0x2e +0, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2096, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg104
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg105
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg106
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: save_restore_ppr_zpr
+stack:
+ - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body: |
+ bb.0.entry:
+
+ $p4 = IMPLICIT_DEF
+ $p5 = IMPLICIT_DEF
+ $p6 = IMPLICIT_DEF
+ $z8 = IMPLICIT_DEF
+ $z9 = IMPLICIT_DEF
+ $z10 = IMPLICIT_DEF
+
+ RET_ReallyLR
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 03a6aab..1101416 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -1215,19 +1215,19 @@ body: |
# CHECK: - { id: 2, name: '', type: default, offset: -112, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector,
# CHECK: - { id: 3, name: '', type: default, offset: -114, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector,
+# CHECK-NEXT: stack-id: scalable-predicate-vector,
# CHECK: - { id: 4, name: '', type: spill-slot, offset: -144, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector,
# CHECK: - { id: 5, name: '', type: spill-slot, offset: -146, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector,
+# CHECK-NEXT: stack-id: scalable-predicate-vector,
# CHECK: - { id: 6, name: '', type: spill-slot, offset: -16, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z8',
# CHECK: - { id: 7, name: '', type: spill-slot, offset: -32, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z23',
# CHECK: - { id: 8, name: '', type: spill-slot, offset: -34, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p4',
+# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p4',
# CHECK: - { id: 9, name: '', type: spill-slot, offset: -36, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p15',
+# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p15',
# CHECK: - { id: 10, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16,
# CHECK-NEXT: stack-id: default, callee-saved-register: '$fp',
#
@@ -1295,9 +1295,9 @@ stack:
- { id: 0, type: default, size: 32, alignment: 16, stack-id: scalable-vector }
- { id: 1, type: default, size: 4, alignment: 2, stack-id: scalable-vector }
- { id: 2, type: default, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
- { id: 4, type: spill-slot, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
index bff0cac..0298168 100644
--- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -983,26 +983,22 @@ body: |
; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved
; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4
; EXPAND-NEXT: {{ $}}
- ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
- ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3)
+ ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
- ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2)
+ ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1)
; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
- ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1)
- ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+ ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
;
; EXPAND-NEXT: $p8 = IMPLICIT_DEF
;
- ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
- ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2)
+ ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1)
; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
- ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1)
+ ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
- ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3)
- ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+ ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
; If we spill a register above p8, p4 must also be saved, so we can guarantee
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 2b16dd0f..5569175 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -39,7 +39,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr
; EXPAND: STR_PXI $p0, $sp, 7
@@ -82,7 +82,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr2
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr2
; EXPAND: STR_PXI $p0, $sp, 6
@@ -127,7 +127,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr2
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2
; EXPAND: STR_PXI $p0, $sp, 6
@@ -172,7 +172,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_pnr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_pnr
; EXPAND: STR_PXI $pn0, $sp, 7
@@ -211,7 +211,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_virtreg_pnr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_virtreg_pnr
; EXPAND: renamable $pn8 = WHILEGE_CXX_B
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
new file mode 100644
index 0000000..690a39d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
@@ -0,0 +1,824 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT
+
+; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; %ppr_local sp+2048+30*vscale (= #15, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding sp+2048+16*vscale
+; <hazard padding> sp+1024+16*vscale
+; %zpr_local sp+1024
+; <hazard padding>
+; -> sp
+define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: zpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2048
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: add x8, sp, #1024
+; CHECK-NEXT: str z0, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding fp-16*vscale
+; <hazard padding> fp-1024-16*vscale
+; %zpr_local fp-1024-32*vscale (= #-2, mul vl for str/ldr ZPR)
+; <hazard padding>
+; -> sp
+define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: zpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str z0, [x8, #-2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; %ppr_local sp+2064+14*vscale (= #7, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding sp+2064
+; <hazard padding> sp+1040
+; %fpr_local sp+1032
+; 8 bytes of padding sp+1024
+; <hazard padding>
+; -> sp
+define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: fpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2064
+; CHECK-NEXT: str p0, [x8, #7, mul vl]
+; CHECK-NEXT: str d0, [sp, #1032]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %fpr_local = alloca double
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile double %double, ptr %fpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding>
+; %fpr_local sp+1032
+; 8 bytes of padding sp+1024
+; <hazard padding>
+; -> sp
+define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: fpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str d0, [sp, #1032]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %fpr_local = alloca double
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile double %double, ptr %fpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; %ppr_local sp+2064+30*vscale (= #15, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding> sp+1040+16*vscale
+; <fpr callee save: z8> sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding
+; -> sp
+define void @gpr_and_ppr_local(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: gpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2080 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+; CHECK-NEXT: add x8, sp, #2064
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: str x0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed
+ %ppr_local = alloca <vscale x 16 x i1>
+ %gpr_local = alloca i64
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile i64 %int, ptr %gpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding>
+; <fpr callee save: z8>
+; <hazard padding>
+; %gpr_local sp+8
+; 8 bytes of padding
+; -> sp
+define void @gpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: gpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str x0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed
+ %ppr_local = alloca <vscale x 16 x i1>
+ %gpr_local = alloca i64
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile i64 %int, ptr %gpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-320 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-320 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2088-320 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; <CS PPRs>
+; %ppr_local sp+2080+286*vscale (addvl #17, addpl #7)
+; 14 * vscale bytes of padding sp+2080+272*vscale
+; <hazard padding> sp+1056+272*vscale
+; <CS ZPRs> sp+1056+16*vscale
+; %zpr_local sp+1056
+; %fpr_local sp+1048
+; 8 bytes of padding sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding sp
+; -> sp
+define void @all_stack_areas(<vscale x 16 x i1> %pred, double %fp) {
+; CHECK-LABEL: all_stack_areas:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-17
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xa0, 0x01, 0x1e, 0x22 // sp + 2096 + 160 * VG
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1040
+; CHECK-NEXT: add x0, sp, #2080
+; CHECK-NEXT: add x8, sp, #2080
+; CHECK-NEXT: add x1, sp, #1056
+; CHECK-NEXT: addvl x0, x0, #17
+; CHECK-NEXT: add x2, sp, #1048
+; CHECK-NEXT: add x3, sp, #8
+; CHECK-NEXT: addpl x0, x0, #7
+; CHECK-NEXT: str d0, [sp, #1048]
+; CHECK-NEXT: str p0, [x8, #143, mul vl]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %fpr_local = alloca double
+ ; // Needed to sort %fpr_local into the FPR region
+ store double %fp, ptr %fpr_local
+ ; // Needed to sort %ppr_local into the PPR region
+ store <vscale x 16 x i1> %pred, ptr %ppr_local
+ %gpr_local = alloca i64
+ call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local)
+ ret void
+}
+declare void @foo(ptr, ptr, ptr, ptr)
+
+; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1064-320 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2096-320 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2104-320 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; -> fp
+; <CS PPRs> fp-32*vscale
+; %ppr_local fp-34*vscale (addpl #-17)
+; 14 * vscale bytes of padding fp-48*vscale
+; <hazard padding> fp-1024-48*vscale
+; <CS ZPRs> fp-1024-304*vscale
+; %zpr_local sp-1024-320*vscale (addvl #-20)
+; %fpr_local sp+1048
+; 8 bytes of padding sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding sp
+; -> sp
+define void @all_stack_areas_fp(<vscale x 16 x i1> %pred, double %fp) "frame-pointer"="all" {
+; CHECK-LABEL: all_stack_areas_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-17
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1056
+; CHECK-NEXT: sub x1, x29, #1024
+; CHECK-NEXT: addpl x0, x29, #-17
+; CHECK-NEXT: add x2, sp, #1048
+; CHECK-NEXT: addvl x1, x1, #-20
+; CHECK-NEXT: add x3, sp, #8
+; CHECK-NEXT: str d0, [sp, #1048]
+; CHECK-NEXT: str p0, [x29, #-17, mul vl]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %fpr_local = alloca double
+ ; // Needed to sort %fpr_local into the FPR region
+ store double %fp, ptr %fpr_local
+ ; // Needed to sort %ppr_local into the PPR region
+ store <vscale x 16 x i1> %pred, ptr %ppr_local
+ %gpr_local = alloca i64
+ call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local)
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2112-288 x vscale], Type: Variable, Align: 16, Size: 1024
+
+define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: svecc_call:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w26, -16
+; CHECK-NEXT: .cfi_offset w27, -24
+; CHECK-NEXT: .cfi_offset w28, -32
+; CHECK-NEXT: .cfi_offset vg, -48
+; CHECK-NEXT: .cfi_offset w30, -56
+; CHECK-NEXT: .cfi_offset w29, -64
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: tbz w19, #0, .LBB8_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB8_2: // %entry
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov w1, #45 // =0x2d
+; CHECK-NEXT: mov w2, #37 // =0x25
+; CHECK-NEXT: bl memset
+; CHECK-NEXT: tbz w19, #0, .LBB8_4
+; CHECK-NEXT: // %bb.3: // %entry
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB8_4: // %entry
+; CHECK-NEXT: mov w0, #22647 // =0x5877
+; CHECK-NEXT: movk w0, #59491, lsl #16
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: .cfi_def_cfa wsp, 64
+; CHECK-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w26
+; CHECK-NEXT: .cfi_restore w27
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
+ %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
+ ret i32 -396142473
+}
+declare ptr @memset(ptr, i32, i32)
+
+; FIXME: aarch64-split-sve-objects is currently not supported in this function
+; as it requires stack reealignment (for the 32-byte aligned alloca).
+; GPR CSRs
+; <hazard padding>
+; FPR CSRs
+; <hazrd padding>
+; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here!
+; <realignment padding>
+; -> sp
+define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: zpr_and_ppr_local_realignment:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: sub x9, sp, #1040
+; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1024
+; CHECK-NEXT: addvl x9, x9, #-2
+; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: str p0, [x8, #-1, mul vl]
+; CHECK-NEXT: str z0, [x8, #-2, mul vl]
+; CHECK-NEXT: str x0, [sp]
+; CHECK-NEXT: sub sp, x29, #1024
+; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %gpr_local = alloca i64, align 32
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ store volatile i64 %gpr, ptr %gpr_local
+ ret void
+}
+
+define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr)
+; CHECK-LABEL: zpr_and_ppr_local_stack_probing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1824
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2848
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: add x8, sp, #1824
+; CHECK-NEXT: str z0, [x8]
+; CHECK-NEXT: str x0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1824
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible"
+{
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %gpr_local = alloca i64, i64 100, align 8
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ store volatile i64 %gpr, ptr %gpr_local
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 5f52280..333a8be 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE
define i32 @basic(i32 noundef %num) {
; CHECK-LABEL: basic:
@@ -1503,72 +1504,24 @@ define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>
}
define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
-; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK0-NEXT: addvl sp, sp, #-1
-; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p5.b, p0.b
-; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p4.b, p1.b
-; CHECK0-NEXT: mov p0.b, p2.b
-; CHECK0-NEXT: mov p1.b, p3.b
-; CHECK0-NEXT: mov p2.b, p5.b
-; CHECK0-NEXT: mov p3.b, p4.b
-; CHECK0-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: addvl sp, sp, #1
-; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK64: // %bb.0:
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: addvl sp, sp, #-1
-; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: sub sp, sp, #64
-; CHECK64-NEXT: mov p4.b, p1.b
-; CHECK64-NEXT: mov p5.b, p0.b
-; CHECK64-NEXT: mov p0.b, p2.b
-; CHECK64-NEXT: mov p1.b, p3.b
-; CHECK64-NEXT: mov p2.b, p5.b
-; CHECK64-NEXT: mov p3.b, p4.b
-; CHECK64-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK64-NEXT: add sp, sp, #64
-; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: addvl sp, sp, #1
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
-; CHECK64-NEXT: ret
-;
-; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK1024: // %bb.0:
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov p4.b, p1.b
-; CHECK1024-NEXT: mov p5.b, p0.b
-; CHECK1024-NEXT: mov p0.b, p2.b
-; CHECK1024-NEXT: mov p1.b, p3.b
-; CHECK1024-NEXT: mov p2.b, p5.b
-; CHECK1024-NEXT: mov p3.b, p4.b
-; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #1
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ret
+; CHECK-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p5.b, p0.b
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p4.b, p1.b
+; CHECK-NEXT: mov p0.b, p2.b
+; CHECK-NEXT: mov p1.b, p3.b
+; CHECK-NEXT: mov p2.b, p5.b
+; CHECK-NEXT: mov p3.b, p4.b
+; CHECK-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
ret [2 x <vscale x 4 x i1>] %res
}
@@ -2113,139 +2066,269 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_call:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov x8, x0
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB28_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB28_2: // %entry
-; CHECK1024-NEXT: mov x0, x8
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: mov w2, #37 // =0x25
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB28_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB28_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #18
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB28_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB28_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: mov x8, x0
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB28_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, x8
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB28_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
%call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
@@ -2505,138 +2588,267 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_alloca_call:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub sp, sp, #1072
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB29_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB29_2: // %entry
-; CHECK1024-NEXT: mov x0, sp
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: mov w2, #37 // =0x25
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB29_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB29_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: add sp, sp, #1072
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #18
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_alloca_call:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB29_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB29_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1072
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_alloca_call:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB29_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, sp
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB29_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1072
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 7bddd1d..cc63c7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -56,9 +56,9 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
; CHECK: name: caller_with_many_svepred_arg
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector
+; CHECK-NEXT: stack-id: scalable-predicate-vector
; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector
+; CHECK-NEXT: stack-id: scalable-predicate-vector
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0
; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
@@ -90,7 +90,7 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i
; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
@@ -139,7 +139,7 @@ define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <v
; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
@@ -200,7 +200,7 @@ define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <v
; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
new file mode 100644
index 0000000..584753b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
@@ -0,0 +1,2854 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @sve_load_store_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ store <vscale x 1 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ store <vscale x 2 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ store <vscale x 3 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ store <vscale x 4 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ store <vscale x 5 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ store <vscale x 6 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ store <vscale x 7 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ store <vscale x 8 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ store <vscale x 9 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ store <vscale x 10 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ store <vscale x 11 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ store <vscale x 12 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ store <vscale x 13 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: ld1b { z1.h }, p2/z, [x0]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p2, [x1]
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ store <vscale x 14 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ store <vscale x 15 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ store <vscale x 16 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv17i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv17i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #17 // =0x11
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 17 x i8>, ptr %a
+ store <vscale x 17 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv18i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv18i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, x8]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 18 x i8>, ptr %a
+ store <vscale x 18 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv19i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv19i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #19 // =0x13
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 19 x i8>, ptr %a
+ store <vscale x 19 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv20i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv20i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 20 x i8>, ptr %a
+ store <vscale x 20 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv21i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv21i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #21 // =0x15
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 21 x i8>, ptr %a
+ store <vscale x 21 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv22i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv22i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x8, all, mul #5
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.d }, p1/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p1, [x1, x8]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 22 x i8>, ptr %a
+ store <vscale x 22 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv23i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv23i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #23 // =0x17
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 23 x i8>, ptr %a
+ store <vscale x 23 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv24i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv24i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1b { z0.h }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 24 x i8>, ptr %a
+ store <vscale x 24 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv25i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv25i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #25 // =0x19
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 25 x i8>, ptr %a
+ store <vscale x 25 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv26i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv26i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cnth x8, all, mul #3
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 26 x i8>, ptr %a
+ store <vscale x 26 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv27i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv27i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #27 // =0x1b
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 27 x i8>, ptr %a
+ store <vscale x 27 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv28i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv28i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 28 x i8>, ptr %a
+ store <vscale x 28 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv29i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv29i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #29 // =0x1d
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 29 x i8>, ptr %a
+ store <vscale x 29 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv30i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv30i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cntw x8, all, mul #7
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z2.h }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z2.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p2, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #6, mul vl]
+; CHECK-NEXT: str z3, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 30 x i8>, ptr %a
+ store <vscale x 30 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv31i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv31i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #31 // =0x1f
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 31 x i8>, ptr %a
+ store <vscale x 31 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv32i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 32 x i8>, ptr %a
+ store <vscale x 32 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ store <vscale x 1 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ store <vscale x 2 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ store <vscale x 3 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ store <vscale x 4 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ store <vscale x 5 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ store <vscale x 6 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ store <vscale x 7 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ store <vscale x 8 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i16>, ptr %a
+ store <vscale x 9 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i16>, ptr %a
+ store <vscale x 10 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i16>, ptr %a
+ store <vscale x 11 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1h { z0.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i16>, ptr %a
+ store <vscale x 12 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i16>, ptr %a
+ store <vscale x 13 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i16>, ptr %a
+ store <vscale x 14 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i16>, ptr %a
+ store <vscale x 15 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i16>, ptr %a
+ store <vscale x 16 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ store <vscale x 1 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ store <vscale x 2 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ store <vscale x 3 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ store <vscale x 4 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i32>, ptr %a
+ store <vscale x 5 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1w { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i32>, ptr %a
+ store <vscale x 6 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i32>, ptr %a
+ store <vscale x 7 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i32>, ptr %a
+ store <vscale x 8 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i64>, ptr %a
+ store <vscale x 1 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i64>, ptr %a
+ store <vscale x 2 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i64>, ptr %a
+ store <vscale x 3 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i64>, ptr %a
+ store <vscale x 4 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x half>, ptr %a
+ store <vscale x 1 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x half>, ptr %a
+ store <vscale x 2 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x half>, ptr %a
+ store <vscale x 3 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x half>, ptr %a
+ store <vscale x 4 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x half>, ptr %a
+ store <vscale x 5 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x half>, ptr %a
+ store <vscale x 6 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x half>, ptr %a
+ store <vscale x 7 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x half>, ptr %a
+ store <vscale x 8 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x half>, ptr %a
+ store <vscale x 9 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x half>, ptr %a
+ store <vscale x 10 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x half>, ptr %a
+ store <vscale x 11 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x half>, ptr %a
+ store <vscale x 12 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x half>, ptr %a
+ store <vscale x 13 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x half>, ptr %a
+ store <vscale x 14 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x half>, ptr %a
+ store <vscale x 15 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x half>, ptr %a
+ store <vscale x 16 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x float>, ptr %a
+ store <vscale x 1 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x float>, ptr %a
+ store <vscale x 2 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x float>, ptr %a
+ store <vscale x 3 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x float>, ptr %a
+ store <vscale x 4 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x float>, ptr %a
+ store <vscale x 5 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1w { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x float>, ptr %a
+ store <vscale x 6 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x float>, ptr %a
+ store <vscale x 7 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x float>, ptr %a
+ store <vscale x 8 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x double>, ptr %a
+ store <vscale x 1 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x double>, ptr %a
+ store <vscale x 2 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x double>, ptr %a
+ store <vscale x 3 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x double>, ptr %a
+ store <vscale x 4 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x bfloat>, ptr %a
+ store <vscale x 1 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x bfloat>, ptr %a
+ store <vscale x 2 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x bfloat>, ptr %a
+ store <vscale x 3 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x bfloat>, ptr %a
+ store <vscale x 4 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x bfloat>, ptr %a
+ store <vscale x 5 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x bfloat>, ptr %a
+ store <vscale x 6 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x bfloat>, ptr %a
+ store <vscale x 7 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x bfloat>, ptr %a
+ store <vscale x 8 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x bfloat>, ptr %a
+ store <vscale x 9 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x bfloat>, ptr %a
+ store <vscale x 10 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x bfloat>, ptr %a
+ store <vscale x 11 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x bfloat>, ptr %a
+ store <vscale x 12 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x bfloat>, ptr %a
+ store <vscale x 13 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x bfloat>, ptr %a
+ store <vscale x 14 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x bfloat>, ptr %a
+ store <vscale x 15 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x bfloat>, ptr %a
+ store <vscale x 16 x bfloat> %c, ptr %b
+ ret void
+}
+
+define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+ ret <vscale x 1 x i16> %c.sext
+}
+
+define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.sext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %c.sext
+}
+
+define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+ ret <vscale x 3 x i16> %c.sext
+}
+
+define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.sext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %c.sext
+}
+
+define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+ ret <vscale x 5 x i16> %c.sext
+}
+
+define <vscale x 6 x i16> @sve_sextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.sext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+ ret <vscale x 6 x i16> %c.sext
+}
+
+define <vscale x 7 x i16> @sve_sextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.sext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+ ret <vscale x 7 x i16> %c.sext
+}
+
+define <vscale x 8 x i16> @sve_sextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.sext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %c.sext
+}
+
+define <vscale x 9 x i16> @sve_sextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.sext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+ ret <vscale x 9 x i16> %c.sext
+}
+
+define <vscale x 10 x i16> @sve_sextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.sext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+ ret <vscale x 10 x i16> %c.sext
+}
+
+define <vscale x 11 x i16> @sve_sextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.sext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+ ret <vscale x 11 x i16> %c.sext
+}
+
+define <vscale x 12 x i16> @sve_sextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.sext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+ ret <vscale x 12 x i16> %c.sext
+}
+
+define <vscale x 13 x i16> @sve_sextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.sext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+ ret <vscale x 13 x i16> %c.sext
+}
+
+define <vscale x 14 x i16> @sve_sextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.sext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+ ret <vscale x 14 x i16> %c.sext
+}
+
+define <vscale x 15 x i16> @sve_sextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.sext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+ ret <vscale x 15 x i16> %c.sext
+}
+
+define <vscale x 16 x i16> @sve_sextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.sext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16>
+ ret <vscale x 16 x i16> %c.sext
+}
+
+define <vscale x 1 x i32> @sve_sextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.sext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+ ret <vscale x 1 x i32> %c.sext
+}
+
+define <vscale x 2 x i32> @sve_sextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.sext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %c.sext
+}
+
+define <vscale x 3 x i32> @sve_sextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.sext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+ ret <vscale x 3 x i32> %c.sext
+}
+
+define <vscale x 4 x i32> @sve_sextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.sext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %c.sext
+}
+
+define <vscale x 5 x i32> @sve_sextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.sext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+ ret <vscale x 5 x i32> %c.sext
+}
+
+define <vscale x 6 x i32> @sve_sextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.sext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+ ret <vscale x 6 x i32> %c.sext
+}
+
+define <vscale x 7 x i32> @sve_sextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.sext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+ ret <vscale x 7 x i32> %c.sext
+}
+
+define <vscale x 8 x i32> @sve_sextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.sext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %c.sext
+}
+
+define <vscale x 1 x i64> @sve_sextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.sext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+ ret <vscale x 1 x i64> %c.sext
+}
+
+define <vscale x 2 x i64> @sve_sextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.sext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %c.sext
+}
+
+define <vscale x 3 x i64> @sve_sextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.sext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+ ret <vscale x 3 x i64> %c.sext
+}
+
+define <vscale x 4 x i64> @sve_sextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.sext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %c.sext
+}
+
+define <vscale x 1 x i16> @sve_zextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.zext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+ ret <vscale x 1 x i16> %c.zext
+}
+
+define <vscale x 2 x i16> @sve_zextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.zext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %c.zext
+}
+
+define <vscale x 3 x i16> @sve_zextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.zext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+ ret <vscale x 3 x i16> %c.zext
+}
+
+define <vscale x 4 x i16> @sve_zextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.zext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %c.zext
+}
+
+define <vscale x 5 x i16> @sve_zextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.zext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+ ret <vscale x 5 x i16> %c.zext
+}
+
+define <vscale x 6 x i16> @sve_zextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.zext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+ ret <vscale x 6 x i16> %c.zext
+}
+
+define <vscale x 7 x i16> @sve_zextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.zext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+ ret <vscale x 7 x i16> %c.zext
+}
+
+define <vscale x 8 x i16> @sve_zextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.zext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %c.zext
+}
+
+define <vscale x 9 x i16> @sve_zextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.zext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+ ret <vscale x 9 x i16> %c.zext
+}
+
+define <vscale x 10 x i16> @sve_zextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.zext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+ ret <vscale x 10 x i16> %c.zext
+}
+
+define <vscale x 11 x i16> @sve_zextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.zext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+ ret <vscale x 11 x i16> %c.zext
+}
+
+define <vscale x 12 x i16> @sve_zextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.zext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+ ret <vscale x 12 x i16> %c.zext
+}
+
+define <vscale x 13 x i16> @sve_zextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.zext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16>
+ ret <vscale x 13 x i16> %c.zext
+}
+
+define <vscale x 14 x i16> @sve_zextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.zext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16>
+ ret <vscale x 14 x i16> %c.zext
+}
+
+define <vscale x 15 x i16> @sve_zextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.zext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16>
+ ret <vscale x 15 x i16> %c.zext
+}
+
+define <vscale x 16 x i16> @sve_zextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.zext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16>
+ ret <vscale x 16 x i16> %c.zext
+}
+
+define <vscale x 1 x i32> @sve_zextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.zext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32>
+ ret <vscale x 1 x i32> %c.zext
+}
+
+define <vscale x 2 x i32> @sve_zextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.zext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32>
+ ret <vscale x 2 x i32> %c.zext
+}
+
+define <vscale x 3 x i32> @sve_zextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.zext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32>
+ ret <vscale x 3 x i32> %c.zext
+}
+
+define <vscale x 4 x i32> @sve_zextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.zext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32>
+ ret <vscale x 4 x i32> %c.zext
+}
+
+define <vscale x 5 x i32> @sve_zextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.zext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32>
+ ret <vscale x 5 x i32> %c.zext
+}
+
+define <vscale x 6 x i32> @sve_zextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.zext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32>
+ ret <vscale x 6 x i32> %c.zext
+}
+
+define <vscale x 7 x i32> @sve_zextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.zext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32>
+ ret <vscale x 7 x i32> %c.zext
+}
+
+define <vscale x 8 x i32> @sve_zextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.zext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32>
+ ret <vscale x 8 x i32> %c.zext
+}
+
+define <vscale x 1 x i64> @sve_zextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.zext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64>
+ ret <vscale x 1 x i64> %c.zext
+}
+
+define <vscale x 2 x i64> @sve_zextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.zext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64>
+ ret <vscale x 2 x i64> %c.zext
+}
+
+define <vscale x 3 x i64> @sve_zextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.zext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64>
+ ret <vscale x 3 x i64> %c.zext
+}
+
+define <vscale x 4 x i64> @sve_zextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.zext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64>
+ ret <vscale x 4 x i64> %c.zext
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 2cbb29e..d8de12c 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -672,5 +672,3 @@ entry:
ret i32 %x
}
declare void @other()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-FRAMELAYOUT: {{.*}}