diff options
Diffstat (limited to 'llvm/test/CodeGen')
25 files changed, 5479 insertions, 573 deletions
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir index aca2816..7fd0cee 100644 --- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir +++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir @@ -164,10 +164,10 @@ stack: - { id: 1, name: z1.addr, size: 16, alignment: 16, stack-id: scalable-vector, debug-info-variable: '!31', debug-info-expression: '!DIExpression()', debug-info-location: '!32' } - - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-vector, + - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector, debug-info-variable: '!33', debug-info-expression: '!DIExpression()', debug-info-location: '!34' } - - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-vector, + - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector, debug-info-variable: '!35', debug-info-expression: '!DIExpression()', debug-info-location: '!36' } - { id: 4, name: w0.addr, size: 4, alignment: 4, local-offset: -4, debug-info-variable: '!37', @@ -181,10 +181,10 @@ stack: - { id: 7, name: localv1, size: 16, alignment: 16, stack-id: scalable-vector, debug-info-variable: '!45', debug-info-expression: '!DIExpression()', debug-info-location: '!46' } - - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-vector, + - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-predicate-vector, debug-info-variable: '!48', debug-info-expression: '!DIExpression()', debug-info-location: '!49' } - - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-vector, + - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-predicate-vector, debug-info-variable: '!51', debug-info-expression: '!DIExpression()', debug-info-location: '!52' } machineFunctionInfo: {} diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir index 0ea180b..41ba554 100644 --- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir +++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir @@ -96,8 +96,8 @@ stack: - { id: 1, size: 8, alignment: 8 } - { id: 2, size: 16, alignment: 16, stack-id: scalable-vector } - { id: 3, size: 16, alignment: 16, stack-id: scalable-vector } - - { id: 4, size: 2, alignment: 2, stack-id: scalable-vector } - - { id: 5, size: 2, alignment: 2, stack-id: scalable-vector } + - { id: 4, size: 2, alignment: 2, stack-id: scalable-predicate-vector } + - { id: 5, size: 2, alignment: 2, stack-id: scalable-predicate-vector } machineFunctionInfo: {} body: | bb.0.entry: diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir new file mode 100644 index 0000000..35eafe8 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir @@ -0,0 +1,587 @@ +# RUN: llc -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM +# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -filetype=obj -o %t +# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO +# RUN: rm -rf %t +# +# Test allocation and deallocation of SVE objects on the stack with +# split-sve-objects (and hazard padding) enabled. This also tests using a +# combination of scalable and non-scalable offsets to access the SVE on the +# stack. +# +# With split-sve-objects (which implies hazard padding) the SVE area is split +# into PPR and ZPR areas with (fixed-size) hazard padding between them. The PPR +# area holds all scalable predicate callee saves and locals, and the ZPR area +# holds all scalable vector callee saves and locals. Additionally, any FPR +# callee save is promoted to a ZPR callee save (to avoid needing additional +# hazard padding in the callee save area). +# +# +-------------+ +# | stack arg | +# +-------------+ <- SP before call +# | Callee Saves| +# | Frame record| (if available) +# |-------------| <- FP (if available) +# | PPR area | +# |-------------| +# |/////////////| hazard padding +# |-------------| +# | ZPR area | +# +-------------+ +# | : | +# | Stack objs | +# | : | +# +-------------+ <- SP after call and frame-setup +# +--- | + + define void @test_allocate_split_sve() uwtable { entry: unreachable } + define void @test_allocate_split_sve_realigned() uwtable { entry: unreachable } + define void @test_address_split_sve() uwtable { entry: unreachable } + define void @test_address_split_sve_fp() uwtable { entry: unreachable } + define aarch64_sve_vector_pcs void @save_restore_ppr_zpr() uwtable { entry: unreachable } + +... +--- +# +----------+ +# |scratchreg| // x29 is used as scratch reg. +# |----------| +# | %stack.0 | // scalable predicate of n * 12 bytes, aligned to 16 bytes +# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes) +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# +----------+ +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes) +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.1 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_allocate_split_sve +# CHECK: stackSize: 1056 + +# CHECK: bb.0.entry: +# CHECK: liveins: $z0, $p0, $fp +# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# +# CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0 +# CHECK-NEXT: $x8 = ADDPL_XXI $x8, 7, implicit $vg +# CHECK-NEXT: STR_ZXI $z0, killed $x8, 0 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0 +# CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1) +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_split_sve: +# ASM: str x29, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: .cfi_def_cfa_offset 1040 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM-NEXT: sub sp, sp, #1040 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG +# +# ASM: addvl sp, sp, #2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG +# ASM-NEXT: addvl sp, sp, #1 +# ASM-NEXT: .cfi_def_cfa wsp, 1056 +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: ldr x29, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_allocate_split_sve +stack: + - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 } + - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 } + - { id: 2, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $p0 + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1) + RET_ReallyLR +... +--- + +# Stack realignment is not supported with split-sve-objects, so we fallback to +# the default hazard padding implementation. This does not prevent hazards +# between ZPRs and PPRs (TODO: support this case). +# +# +----------+ +# | lr, fp | // frame record +# |----------| +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.0 | // scalable predicate of n * 12 bytes, aligned to 16 bytes +# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes) +# +----------+ +# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes, +# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes) +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.1 | // not scalable +# +----------+ <- SP + +name: test_allocate_split_sve_realigned +stack: + - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 } + - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 } + - { id: 2, stack-id: default, size: 16, alignment: 32 } +body: | + bb.0.entry: + liveins: $z0, $p0 + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1) + RET_ReallyLR + +# CHECK-LABEL: name: test_allocate_split_sve_realigned +# CHECK: stackSize: 2080 + +# CHECK: bb.0.entry: +# CHECK: liveins: $z0, $p0, $lr +# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4) +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg +# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930 +# +# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 +# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg +# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0) +# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1) +# +# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040 +# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4) +# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5) +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_allocate_split_sve_realigned +# ASM: sub sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 1040 +# ASM-NEXT: str x29, [sp, #1024] +# ASM-NEXT: str x30, [sp, #1032] +# ASM-NEXT: add x29, sp, #1024 +# ASM-NEXT: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# +# ASM: sub sp, x29, #1024 +# ASM-NEXT: .cfi_def_cfa wsp, 1040 +# ASM-NEXT: ldr x30, [sp, #1032] +# ASM-NEXT: ldr x29, [sp, #1024] +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w30 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg30 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 +... +--- + +# +----------+ +# |scratchreg| // x29 is used as scratch reg. +# +----------+ +# | %stack.2 | // scalable predicate @ SP + 2064b + 46 scalable bytes +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# |----------| +# | %stack.0 | // scalable vector @ SP + 1040b + 16 scalable bytes +# | %stack.1 | // scalable vector @ SP + 1040b +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.3 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_address_split_sve +# CHECK: stackSize: 1056 + +# CHECK: bb.0.entry: +# CHECK-NEXT: liveins: +# CHECK-NEXT: {{ $}} +# CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 +# +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 +# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 1 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0 +# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], 0 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0 +# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056 +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_address_split_sve +# ASM: str x29, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: .cfi_def_cfa_offset 1040 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM-NEXT: sub sp, sp, #1040 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: addvl sp, sp, #-2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG +# +# ASM: addvl sp, sp, #2 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG +# ASM-NEXT: addvl sp, sp, #1 +# ASM-NEXT: .cfi_def_cfa wsp, 1056 +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: ldr x29, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +1040 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056 +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_address_split_sve +frameInfo: + maxAlignment: 16 +stack: + - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $z1, $p0 + + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1) + STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2) + + RET_ReallyLR +... +--- +# +----------+ +# | lr, fp | // frame record +# +----------+ <- FP +# | %stack.2 | // scalable predicate @ FP - 2 scalable bytes +# |----------| +# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area +# |//////////| // Note: This is currently not included in the "stackSize" +# |----------| +# | %stack.0 | // scalable vector @ FP - 1024b - 32 scalable bytes +# | %stack.1 | // scalable vector @ FP - 1024b - 48 scalable bytes +# +----------+ +# |//////////| // hazard padding (1024 bytes) +# |----------| +# | %stack.3 | // not scalable +# +----------+ <- SP + +# CHECK-LABEL: name: test_address_split_sve_fp +# CHECK: stackSize: 1056 +# +# CHECK: bb.0.entry: +# CHECK-NEXT: liveins: +# CHECK-NEXT: {{ $}} +# CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.6), (store (s64) into %stack.5) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg +# +# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2 +# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0 +# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3 +# CHECK-NEXT: STR_PXI $p0, $fp, -1 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: test_address_split_sve_fp +# ASM: stp x29, x30, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: mov x29, sp +# ASM-NEXT: .cfi_def_cfa w29, 16 +# ASM-NEXT: .cfi_offset w30, -8 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: sub sp, sp, #1040 +# ASM-NEXT: addvl sp, sp, #-2 +# +# ASM: addvl sp, sp, #2 +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: addvl sp, sp, #1 +# ASM-NEXT: add sp, sp, #1040 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldp x29, x30, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w30 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO: DW_CFA_def_cfa: reg29 +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg30 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: test_address_split_sve_fp +frameInfo: + maxAlignment: 16 + isFrameAddressTaken: true +stack: + - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 } + - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 } + - { id: 3, stack-id: default, size: 16, alignment: 8 } +body: | + bb.0.entry: + liveins: $z0, $z1, $p0 + + STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0) + STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1) + STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2) + + RET_ReallyLR +... +--- +# CHECK-LABEL: name: save_restore_ppr_zpr +# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.8) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16 +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: frame-setup STR_PXI killed $p6, $sp, 5 :: (store (s16) into %stack.7) +# CHECK-NEXT: frame-setup STR_PXI killed $p5, $sp, 6 :: (store (s16) into %stack.6) +# CHECK-NEXT: frame-setup STR_PXI killed $p4, $sp, 7 :: (store (s16) into %stack.5) +# +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# +# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 :: (store (s128) into %stack.4) +# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 :: (store (s128) into %stack.3) +# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 :: (store (s128) into %stack.2) +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 +# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1056, 0 +# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# +# +# CHECK: $sp = frame-destroy ADDXri $sp, 1056, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4) +# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3) +# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.2) +# +# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 +# +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10 +# CHECK-NEXT: $p6 = frame-destroy LDR_PXI $sp, 5 :: (load (s16) from %stack.7) +# CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 6 :: (load (s16) from %stack.6) +# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 :: (load (s16) from %stack.5) +# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16 +# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.8) +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0 +# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29 +# CHECK-NEXT: RET_ReallyLR + +# ASM-LABEL: save_restore_ppr_zpr: +# ASM: str x29, [sp, #-16]! +# ASM-NEXT: .cfi_def_cfa_offset 16 +# ASM-NEXT: .cfi_offset w29, -16 +# ASM-NEXT: addvl sp, sp, #-1 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +# ASM-NEXT: str p6, [sp, #5, mul vl] +# ASM-NEXT: str p5, [sp, #6, mul vl] +# ASM-NEXT: str p4, [sp, #7, mul vl] +# ASM-NEXT: sub sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG +# ASM-NEXT: addvl sp, sp, #-3 +# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG +# ASM-NEXT: str z10, [sp] +# ASM-NEXT: str z9, [sp, #1, mul vl] +# ASM-NEXT: str z8, [sp, #2, mul vl] +# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040 +# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1040 +# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1040 +# ASM-NEXT: sub sp, sp, #1056 +# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 2096 + 32 * VG +# +# ASM: add sp, sp, #1056 +# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG +# ASM-NEXT: ldr z10, [sp] +# ASM-NEXT: ldr z9, [sp, #1, mul vl] +# ASM-NEXT: ldr z8, [sp, #2, mul vl] +# ASM-NEXT: add sp, sp, #1024 +# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 16 + 32 * VG +# ASM-NEXT: addvl sp, sp, #3 +# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG +# ASM-NEXT: .cfi_restore z8 +# ASM-NEXT: .cfi_restore z9 +# ASM-NEXT: .cfi_restore z10 +# ASM-NEXT: ldr p6, [sp, #5, mul vl] +# ASM-NEXT: ldr p5, [sp, #6, mul vl] +# ASM-NEXT: ldr p4, [sp, #7, mul vl] +# ASM-NEXT: addvl sp, sp, #1 +# ASM-NEXT: .cfi_def_cfa wsp, 16 +# ASM-NEXT: ldr x29, [sp], #16 +# ASM-NEXT: .cfi_def_cfa_offset 0 +# ASM-NEXT: .cfi_restore w29 + +# UNWINDINFO: DW_CFA_def_cfa_offset: +16 +# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16 +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg72 DW_OP_bregx 0x2e +0, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg73 DW_OP_bregx 0x2e +0, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus +# UNWINDINFO: DW_CFA_expression: reg74 DW_OP_bregx 0x2e +0, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2096, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus +# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg104 +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg105 +# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg106 +# UNWINDINFO: DW_CFA_def_cfa: reg31 +16 +# UNWINDINFO: DW_CFA_def_cfa_offset: +0 +# UNWINDINFO-NEXT: DW_CFA_restore: reg29 + +name: save_restore_ppr_zpr +stack: + - { id: 0, stack-id: default, size: 32, alignment: 16 } +body: | + bb.0.entry: + + $p4 = IMPLICIT_DEF + $p5 = IMPLICIT_DEF + $p6 = IMPLICIT_DEF + $z8 = IMPLICIT_DEF + $z9 = IMPLICIT_DEF + $z10 = IMPLICIT_DEF + + RET_ReallyLR diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir index 03a6aab..1101416 100644 --- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir +++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir @@ -1215,19 +1215,19 @@ body: | # CHECK: - { id: 2, name: '', type: default, offset: -112, size: 16, alignment: 16, # CHECK-NEXT: stack-id: scalable-vector, # CHECK: - { id: 3, name: '', type: default, offset: -114, size: 2, alignment: 2, -# CHECK-NEXT: stack-id: scalable-vector, +# CHECK-NEXT: stack-id: scalable-predicate-vector, # CHECK: - { id: 4, name: '', type: spill-slot, offset: -144, size: 16, alignment: 16, # CHECK-NEXT: stack-id: scalable-vector, # CHECK: - { id: 5, name: '', type: spill-slot, offset: -146, size: 2, alignment: 2, -# CHECK-NEXT: stack-id: scalable-vector, +# CHECK-NEXT: stack-id: scalable-predicate-vector, # CHECK: - { id: 6, name: '', type: spill-slot, offset: -16, size: 16, alignment: 16, # CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z8', # CHECK: - { id: 7, name: '', type: spill-slot, offset: -32, size: 16, alignment: 16, # CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z23', # CHECK: - { id: 8, name: '', type: spill-slot, offset: -34, size: 2, alignment: 2, -# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p4', +# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p4', # CHECK: - { id: 9, name: '', type: spill-slot, offset: -36, size: 2, alignment: 2, -# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p15', +# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p15', # CHECK: - { id: 10, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16, # CHECK-NEXT: stack-id: default, callee-saved-register: '$fp', # @@ -1295,9 +1295,9 @@ stack: - { id: 0, type: default, size: 32, alignment: 16, stack-id: scalable-vector } - { id: 1, type: default, size: 4, alignment: 2, stack-id: scalable-vector } - { id: 2, type: default, size: 16, alignment: 16, stack-id: scalable-vector } - - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-vector } + - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-predicate-vector } - { id: 4, type: spill-slot, size: 16, alignment: 16, stack-id: scalable-vector } - - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-vector } + - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-predicate-vector } body: | bb.0.entry: diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir index bff0cac..0298168 100644 --- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir +++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir @@ -983,26 +983,22 @@ body: | ; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved ; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4 ; EXPAND-NEXT: {{ $}} - ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0 - ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3) + ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2) ; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0 - ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2) + ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1) ; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0 - ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1) - ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0 + ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0) ; ; EXPAND-NEXT: $p8 = IMPLICIT_DEF ; - ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0 - ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2) + ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1) ; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg ; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv - ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1) + ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0) ; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv ; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg - ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3) - ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0 + ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2) ; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3 ; If we spill a register above p8, p4 must also be saved, so we can guarantee diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 2b16dd0f..5569175 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -39,7 +39,7 @@ body: | ; CHECK-LABEL: name: spills_fills_stack_id_ppr ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '' ; EXPAND-LABEL: name: spills_fills_stack_id_ppr ; EXPAND: STR_PXI $p0, $sp, 7 @@ -82,7 +82,7 @@ body: | ; CHECK-LABEL: name: spills_fills_stack_id_ppr2 ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '' ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2 ; EXPAND: STR_PXI $p0, $sp, 6 @@ -127,7 +127,7 @@ body: | ; CHECK-LABEL: name: spills_fills_stack_id_ppr2 ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '' ; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2 ; EXPAND: STR_PXI $p0, $sp, 6 @@ -172,7 +172,7 @@ body: | ; CHECK-LABEL: name: spills_fills_stack_id_pnr ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '' ; EXPAND-LABEL: name: spills_fills_stack_id_pnr ; EXPAND: STR_PXI $pn0, $sp, 7 @@ -211,7 +211,7 @@ body: | ; CHECK-LABEL: name: spills_fills_stack_id_virtreg_pnr ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2 - ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '' + ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '' ; EXPAND-LABEL: name: spills_fills_stack_id_virtreg_pnr ; EXPAND: renamable $pn8 = WHILEGE_CXX_B diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll new file mode 100644 index 0000000..690a39d --- /dev/null +++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll @@ -0,0 +1,824 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 | FileCheck %s +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT + +; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; %ppr_local sp+2048+30*vscale (= #15, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding sp+2048+16*vscale +; <hazard padding> sp+1024+16*vscale +; %zpr_local sp+1024 +; <hazard padding> +; -> sp +define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: zpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2048 +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: add x8, sp, #1024 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding fp-16*vscale +; <hazard padding> fp-1024-16*vscale +; %zpr_local fp-1024-32*vscale (= #-2, mul vl for str/ldr ZPR) +; <hazard padding> +; -> sp +define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: zpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-2, mul vl] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; %ppr_local sp+2064+14*vscale (= #7, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding sp+2064 +; <hazard padding> sp+1040 +; %fpr_local sp+1032 +; 8 bytes of padding sp+1024 +; <hazard padding> +; -> sp +define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: fpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2064 +; CHECK-NEXT: str p0, [x8, #7, mul vl] +; CHECK-NEXT: str d0, [sp, #1032] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %fpr_local = alloca double + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile double %double, ptr %fpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024 + +; <GPRs> +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; <hazard padding> +; %fpr_local sp+1032 +; 8 bytes of padding sp+1024 +; <hazard padding> +; -> sp +define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: fpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str d0, [sp, #1032] +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %fpr_local = alloca double + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile double %double, ptr %fpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; %ppr_local sp+2064+30*vscale (= #15, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; <hazard padding> sp+1040+16*vscale +; <fpr callee save: z8> sp+1040 +; <hazard padding> sp+16 +; %gpr_local sp+8 +; 8 bytes of padding +; -> sp +define void @gpr_and_ppr_local(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: gpr_and_ppr_local: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2080 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040 +; CHECK-NEXT: add x8, sp, #2064 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed + %ppr_local = alloca <vscale x 16 x i1> + %gpr_local = alloca i64 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile i64 %int, ptr %gpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; -> fp +; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR) +; 14 x vscale bytes of padding +; <hazard padding> +; <fpr callee save: z8> +; <hazard padding> +; %gpr_local sp+8 +; 8 bytes of padding +; -> sp +define void @gpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" "frame-pointer"="all" { +; CHECK-LABEL: gpr_and_ppr_local_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: str p0, [x29, #-1, mul vl] +; CHECK-NEXT: str x0, [sp, #8] +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed + %ppr_local = alloca <vscale x 16 x i1> + %gpr_local = alloca i64 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile i64 %int, ptr %gpr_local + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-320 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-320 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2088-320 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; <CS PPRs> +; %ppr_local sp+2080+286*vscale (addvl #17, addpl #7) +; 14 * vscale bytes of padding sp+2080+272*vscale +; <hazard padding> sp+1056+272*vscale +; <CS ZPRs> sp+1056+16*vscale +; %zpr_local sp+1056 +; %fpr_local sp+1048 +; 8 bytes of padding sp+1040 +; <hazard padding> sp+16 +; %gpr_local sp+8 +; 8 bytes of padding sp +; -> sp +define void @all_stack_areas(<vscale x 16 x i1> %pred, double %fp) { +; CHECK-LABEL: all_stack_areas: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xa0, 0x01, 0x1e, 0x22 // sp + 2096 + 160 * VG +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1040 +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1040 +; CHECK-NEXT: add x0, sp, #2080 +; CHECK-NEXT: add x8, sp, #2080 +; CHECK-NEXT: add x1, sp, #1056 +; CHECK-NEXT: addvl x0, x0, #17 +; CHECK-NEXT: add x2, sp, #1048 +; CHECK-NEXT: add x3, sp, #8 +; CHECK-NEXT: addpl x0, x0, #7 +; CHECK-NEXT: str d0, [sp, #1048] +; CHECK-NEXT: str p0, [x8, #143, mul vl] +; CHECK-NEXT: bl foo +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %fpr_local = alloca double + ; // Needed to sort %fpr_local into the FPR region + store double %fp, ptr %fpr_local + ; // Needed to sort %ppr_local into the PPR region + store <vscale x 16 x i1> %pred, ptr %ppr_local + %gpr_local = alloca i64 + call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) + ret void +} +declare void @foo(ptr, ptr, ptr, ptr) + +; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas_fp +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1064-320 x vscale], Type: Variable, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2096-320 x vscale], Type: Variable, Align: 16, Size: 1024 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2104-320 x vscale], Type: Variable, Align: 8, Size: 8 + +; <CS GPRs> +; -> fp +; <CS PPRs> fp-32*vscale +; %ppr_local fp-34*vscale (addpl #-17) +; 14 * vscale bytes of padding fp-48*vscale +; <hazard padding> fp-1024-48*vscale +; <CS ZPRs> fp-1024-304*vscale +; %zpr_local sp-1024-320*vscale (addvl #-20) +; %fpr_local sp+1048 +; 8 bytes of padding sp+1040 +; <hazard padding> sp+16 +; %gpr_local sp+8 +; 8 bytes of padding sp +; -> sp +define void @all_stack_areas_fp(<vscale x 16 x i1> %pred, double %fp) "frame-pointer"="all" { +; CHECK-LABEL: all_stack_areas_fp: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill +; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-17 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: .cfi_def_cfa w29, 32 +; CHECK-NEXT: .cfi_offset w28, -16 +; CHECK-NEXT: .cfi_offset w30, -24 +; CHECK-NEXT: .cfi_offset w29, -32 +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1056 +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1056 +; CHECK-NEXT: sub x1, x29, #1024 +; CHECK-NEXT: addpl x0, x29, #-17 +; CHECK-NEXT: add x2, sp, #1048 +; CHECK-NEXT: addvl x1, x1, #-20 +; CHECK-NEXT: add x3, sp, #8 +; CHECK-NEXT: str d0, [sp, #1048] +; CHECK-NEXT: str p0, [x29, #-17, mul vl] +; CHECK-NEXT: bl foo +; CHECK-NEXT: add sp, sp, #1056 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #17 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %fpr_local = alloca double + ; // Needed to sort %fpr_local into the FPR region + store double %fp, ptr %fpr_local + ; // Needed to sort %ppr_local into the PPR region + store <vscale x 16 x i1> %pred, ptr %ppr_local + %gpr_local = alloca i64 + call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) + ret void +} + +; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 16, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16 +; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2112-288 x vscale], Type: Variable, Align: 16, Size: 1024 + +define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: svecc_call: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK-NEXT: .cfi_def_cfa_offset 64 +; CHECK-NEXT: cntd x9 +; CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK-NEXT: mov x29, sp +; CHECK-NEXT: .cfi_def_cfa w29, 64 +; CHECK-NEXT: .cfi_offset w19, -8 +; CHECK-NEXT: .cfi_offset w26, -16 +; CHECK-NEXT: .cfi_offset w27, -24 +; CHECK-NEXT: .cfi_offset w28, -32 +; CHECK-NEXT: .cfi_offset vg, -48 +; CHECK-NEXT: .cfi_offset w30, -56 +; CHECK-NEXT: .cfi_offset w29, -64 +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-16 +; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: mov x8, x0 +; CHECK-NEXT: bl __arm_sme_state +; CHECK-NEXT: mov x19, x0 +; CHECK-NEXT: //APP +; CHECK-NEXT: //NO_APP +; CHECK-NEXT: tbz w19, #0, .LBB8_2 +; CHECK-NEXT: // %bb.1: // %entry +; CHECK-NEXT: smstop sm +; CHECK-NEXT: .LBB8_2: // %entry +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov w1, #45 // =0x2d +; CHECK-NEXT: mov w2, #37 // =0x25 +; CHECK-NEXT: bl memset +; CHECK-NEXT: tbz w19, #0, .LBB8_4 +; CHECK-NEXT: // %bb.3: // %entry +; CHECK-NEXT: smstart sm +; CHECK-NEXT: .LBB8_4: // %entry +; CHECK-NEXT: mov w0, #22647 // =0x5877 +; CHECK-NEXT: movk w0, #59491, lsl #16 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #16 +; CHECK-NEXT: .cfi_restore z8 +; CHECK-NEXT: .cfi_restore z9 +; CHECK-NEXT: .cfi_restore z10 +; CHECK-NEXT: .cfi_restore z11 +; CHECK-NEXT: .cfi_restore z12 +; CHECK-NEXT: .cfi_restore z13 +; CHECK-NEXT: .cfi_restore z14 +; CHECK-NEXT: .cfi_restore z15 +; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: .cfi_def_cfa wsp, 64 +; CHECK-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: .cfi_restore w19 +; CHECK-NEXT: .cfi_restore w26 +; CHECK-NEXT: .cfi_restore w27 +; CHECK-NEXT: .cfi_restore w28 +; CHECK-NEXT: .cfi_restore vg +; CHECK-NEXT: .cfi_restore w30 +; CHECK-NEXT: .cfi_restore w29 +; CHECK-NEXT: ret +entry: + tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 + %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) + ret i32 -396142473 +} +declare ptr @memset(ptr, i32, i32) + +; FIXME: aarch64-split-sve-objects is currently not supported in this function +; as it requires stack reealignment (for the 32-byte aligned alloca). +; GPR CSRs +; <hazard padding> +; FPR CSRs +; <hazrd padding> +; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here! +; <realignment padding> +; -> sp +define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" { +; CHECK-LABEL: zpr_and_ppr_local_realignment: +; CHECK: // %bb.0: +; CHECK-NEXT: sub sp, sp, #1040 +; CHECK-NEXT: sub x9, sp, #1040 +; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK-NEXT: add x29, sp, #1024 +; CHECK-NEXT: addvl x9, x9, #-2 +; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0 +; CHECK-NEXT: .cfi_def_cfa w29, 16 +; CHECK-NEXT: .cfi_offset w30, -8 +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: sub x8, x29, #1024 +; CHECK-NEXT: str p0, [x8, #-1, mul vl] +; CHECK-NEXT: str z0, [x8, #-2, mul vl] +; CHECK-NEXT: str x0, [sp] +; CHECK-NEXT: sub sp, x29, #1024 +; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK-NEXT: add sp, sp, #1040 +; CHECK-NEXT: ret + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %gpr_local = alloca i64, align 32 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + store volatile i64 %gpr, ptr %gpr_local + ret void +} + +define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) +; CHECK-LABEL: zpr_and_ppr_local_stack_probing: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: sub sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: sub sp, sp, #1824 +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str xzr, [sp] +; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: add x8, sp, #2848 +; CHECK-NEXT: str p0, [x8, #15, mul vl] +; CHECK-NEXT: add x8, sp, #1824 +; CHECK-NEXT: str z0, [x8] +; CHECK-NEXT: str x0, [sp] +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1024 +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: add sp, sp, #1824 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible" +{ + %ppr_local = alloca <vscale x 16 x i1> + %zpr_local = alloca <vscale x 16 x i8> + %gpr_local = alloca i64, i64 100, align 8 + store volatile <vscale x 16 x i1> %pred, ptr %ppr_local + store volatile <vscale x 16 x i8> %vector, ptr %zpr_local + store volatile i64 %gpr, ptr %gpr_local + ret void +} diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll index 5f52280..333a8be 100644 --- a/llvm/test/CodeGen/AArch64/stack-hazard.ll +++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0 ; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64 -; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024 +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE +; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE define i32 @basic(i32 noundef %num) { ; CHECK-LABEL: basic: @@ -1503,72 +1504,24 @@ define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1> } define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" { -; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK0: // %bb.0: -; CHECK0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill -; CHECK0-NEXT: addvl sp, sp, #-1 -; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: mov p5.b, p0.b -; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK0-NEXT: mov p4.b, p1.b -; CHECK0-NEXT: mov p0.b, p2.b -; CHECK0-NEXT: mov p1.b, p3.b -; CHECK0-NEXT: mov p2.b, p5.b -; CHECK0-NEXT: mov p3.b, p4.b -; CHECK0-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK0-NEXT: addvl sp, sp, #1 -; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload -; CHECK0-NEXT: ret -; -; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK64: // %bb.0: -; CHECK64-NEXT: sub sp, sp, #80 -; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill -; CHECK64-NEXT: addvl sp, sp, #-1 -; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK64-NEXT: sub sp, sp, #64 -; CHECK64-NEXT: mov p4.b, p1.b -; CHECK64-NEXT: mov p5.b, p0.b -; CHECK64-NEXT: mov p0.b, p2.b -; CHECK64-NEXT: mov p1.b, p3.b -; CHECK64-NEXT: mov p2.b, p5.b -; CHECK64-NEXT: mov p3.b, p4.b -; CHECK64-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK64-NEXT: add sp, sp, #64 -; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK64-NEXT: addvl sp, sp, #1 -; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload -; CHECK64-NEXT: add sp, sp, #80 -; CHECK64-NEXT: ret -; -; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller: -; CHECK1024: // %bb.0: -; CHECK1024-NEXT: sub sp, sp, #1040 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: addvl sp, sp, #-1 -; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov p4.b, p1.b -; CHECK1024-NEXT: mov p5.b, p0.b -; CHECK1024-NEXT: mov p0.b, p2.b -; CHECK1024-NEXT: mov p1.b, p3.b -; CHECK1024-NEXT: mov p2.b, p5.b -; CHECK1024-NEXT: mov p3.b, p4.b -; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1 -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #1 -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1040 -; CHECK1024-NEXT: ret +; CHECK-LABEL: sve_signature_pred_2xv4i1_caller: +; CHECK: // %bb.0: +; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-1 +; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p5.b, p0.b +; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK-NEXT: mov p4.b, p1.b +; CHECK-NEXT: mov p0.b, p2.b +; CHECK-NEXT: mov p1.b, p3.b +; CHECK-NEXT: mov p2.b, p5.b +; CHECK-NEXT: mov p3.b, p4.b +; CHECK-NEXT: bl sve_signature_pred_2xv4i1 +; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK-NEXT: addvl sp, sp, #1 +; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload +; CHECK-NEXT: ret %res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1) ret [2 x <vscale x 4 x i1>] %res } @@ -2113,139 +2066,269 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_call: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1024 -; CHECK1024-NEXT: mov x8, x0 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB28_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB28_2: // %entry -; CHECK1024-NEXT: mov x0, x8 -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: mov w2, #37 // =0x25 -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB28_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB28_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: add sp, sp, #1024 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #18 -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_call: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB28_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8 +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB28_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_call: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: mov x8, x0 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB28_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, x8 +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB28_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) @@ -2505,138 +2588,267 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8 ; CHECK64-NEXT: .cfi_restore w29 ; CHECK64-NEXT: ret ; -; CHECK1024-LABEL: svecc_alloca_call: -; CHECK1024: // %bb.0: // %entry -; CHECK1024-NEXT: sub sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 1088 -; CHECK1024-NEXT: cntd x9 -; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill -; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill -; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill -; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill -; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill -; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill -; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill -; CHECK1024-NEXT: add x29, sp, #1024 -; CHECK1024-NEXT: .cfi_def_cfa w29, 64 -; CHECK1024-NEXT: .cfi_offset w19, -16 -; CHECK1024-NEXT: .cfi_offset w26, -24 -; CHECK1024-NEXT: .cfi_offset w27, -32 -; CHECK1024-NEXT: .cfi_offset w28, -40 -; CHECK1024-NEXT: .cfi_offset vg, -48 -; CHECK1024-NEXT: .cfi_offset w30, -56 -; CHECK1024-NEXT: .cfi_offset w29, -64 -; CHECK1024-NEXT: addvl sp, sp, #-18 -; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill -; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill -; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 -; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 -; CHECK1024-NEXT: sub sp, sp, #1072 -; CHECK1024-NEXT: bl __arm_sme_state -; CHECK1024-NEXT: mov x19, x0 -; CHECK1024-NEXT: //APP -; CHECK1024-NEXT: //NO_APP -; CHECK1024-NEXT: tbz w19, #0, .LBB29_2 -; CHECK1024-NEXT: // %bb.1: // %entry -; CHECK1024-NEXT: smstop sm -; CHECK1024-NEXT: .LBB29_2: // %entry -; CHECK1024-NEXT: mov x0, sp -; CHECK1024-NEXT: mov w1, #45 // =0x2d -; CHECK1024-NEXT: mov w2, #37 // =0x25 -; CHECK1024-NEXT: bl memset -; CHECK1024-NEXT: tbz w19, #0, .LBB29_4 -; CHECK1024-NEXT: // %bb.3: // %entry -; CHECK1024-NEXT: smstart sm -; CHECK1024-NEXT: .LBB29_4: // %entry -; CHECK1024-NEXT: mov w0, #22647 // =0x5877 -; CHECK1024-NEXT: movk w0, #59491, lsl #16 -; CHECK1024-NEXT: add sp, sp, #1072 -; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload -; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload -; CHECK1024-NEXT: addvl sp, sp, #18 -; CHECK1024-NEXT: .cfi_restore z8 -; CHECK1024-NEXT: .cfi_restore z9 -; CHECK1024-NEXT: .cfi_restore z10 -; CHECK1024-NEXT: .cfi_restore z11 -; CHECK1024-NEXT: .cfi_restore z12 -; CHECK1024-NEXT: .cfi_restore z13 -; CHECK1024-NEXT: .cfi_restore z14 -; CHECK1024-NEXT: .cfi_restore z15 -; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088 -; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload -; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload -; CHECK1024-NEXT: add sp, sp, #1088 -; CHECK1024-NEXT: .cfi_def_cfa_offset 0 -; CHECK1024-NEXT: .cfi_restore w19 -; CHECK1024-NEXT: .cfi_restore w26 -; CHECK1024-NEXT: .cfi_restore w27 -; CHECK1024-NEXT: .cfi_restore w28 -; CHECK1024-NEXT: .cfi_restore vg -; CHECK1024-NEXT: .cfi_restore w30 -; CHECK1024-NEXT: .cfi_restore w29 -; CHECK1024-NEXT: ret +; CHECK1024-NOSPLITSVE-LABEL: svecc_alloca_call: +; CHECK1024-NOSPLITSVE: // %bb.0: // %entry +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088 +; CHECK1024-NOSPLITSVE-NEXT: cntd x9 +; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18 +; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0 +; CHECK1024-NOSPLITSVE-NEXT: //APP +; CHECK1024-NOSPLITSVE-NEXT: //NO_APP +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_2 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstop sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB29_2: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp +; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-NOSPLITSVE-NEXT: bl memset +; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_4 +; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-NOSPLITSVE-NEXT: smstart sm +; CHECK1024-NOSPLITSVE-NEXT: .LBB29_4: // %entry +; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1072 +; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088 +; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload +; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-NOSPLITSVE-NEXT: ret +; +; CHECK1024-SPLITSVE-LABEL: svecc_alloca_call: +; CHECK1024-SPLITSVE: // %bb.0: // %entry +; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64 +; CHECK1024-SPLITSVE-NEXT: cntd x9 +; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: mov x29, sp +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56 +; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2 +; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16 +; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088 +; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state +; CHECK1024-SPLITSVE-NEXT: mov x19, x0 +; CHECK1024-SPLITSVE-NEXT: //APP +; CHECK1024-SPLITSVE-NEXT: //NO_APP +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_2 +; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry +; CHECK1024-SPLITSVE-NEXT: smstop sm +; CHECK1024-SPLITSVE-NEXT: .LBB29_2: // %entry +; CHECK1024-SPLITSVE-NEXT: mov x0, sp +; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d +; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25 +; CHECK1024-SPLITSVE-NEXT: bl memset +; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_4 +; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry +; CHECK1024-SPLITSVE-NEXT: smstart sm +; CHECK1024-SPLITSVE-NEXT: .LBB29_4: // %entry +; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877 +; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16 +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1072 +; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024 +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15 +; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2 +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64 +; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload +; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30 +; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29 +; CHECK1024-SPLITSVE-NEXT: ret entry: tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll index 7bddd1d..cc63c7f 100644 --- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll +++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll @@ -56,9 +56,9 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(< ; CHECK: name: caller_with_many_svepred_arg ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2, -; CHECK-NEXT: stack-id: scalable-vector +; CHECK-NEXT: stack-id: scalable-predicate-vector ; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 2, alignment: 2, -; CHECK-NEXT: stack-id: scalable-vector +; CHECK-NEXT: stack-id: scalable-predicate-vector ; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0 ; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0 ; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0 @@ -90,7 +90,7 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i ; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1 ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2, -; CHECK-NEXT: stack-id: scalable-vector, +; CHECK-NEXT: stack-id: scalable-predicate-vector, ; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0 ; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp ; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0) @@ -139,7 +139,7 @@ define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <v ; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1 ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2, -; CHECK-NEXT: stack-id: scalable-vector, +; CHECK-NEXT: stack-id: scalable-predicate-vector, ; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3 ; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2 ; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1 @@ -200,7 +200,7 @@ define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <v ; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1 ; CHECK: stack: ; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2, -; CHECK-NEXT: stack-id: scalable-vector, +; CHECK-NEXT: stack-id: scalable-predicate-vector, ; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3 ; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2 ; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1 diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll new file mode 100644 index 0000000..584753b --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll @@ -0,0 +1,2854 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc -mattr=+sve < %s | FileCheck %s + +target triple = "aarch64-unknown-linux-gnu" + +define void @sve_load_store_nxv1i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i8>, ptr %a + store <vscale x 1 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i8>, ptr %a + store <vscale x 2 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i8>, ptr %a + store <vscale x 3 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i8>, ptr %a + store <vscale x 4 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv5i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv5i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 5 x i8>, ptr %a + store <vscale x 5 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv6i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv6i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1b { z1.s }, p1, [x1] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1b { z0.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 6 x i8>, ptr %a + store <vscale x 6 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv7i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 7 x i8>, ptr %a + store <vscale x 7 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv8i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 8 x i8>, ptr %a + store <vscale x 8 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv9i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 9 x i8>, ptr %a + store <vscale x 9 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv10i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv10i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: st1b { z0.h }, p1, [x1] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1b { z1.d }, p0, [x1, #4, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 10 x i8>, ptr %a + store <vscale x 10 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv11i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv11i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 11 x i8>, ptr %a + store <vscale x 11 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv12i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv12i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: st1b { z0.h }, p1, [x1] +; CHECK-NEXT: st1b { z1.s }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 12 x i8>, ptr %a + store <vscale x 12 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv13i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv13i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 13 x i8>, ptr %a + store <vscale x 13 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv14i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv14i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: ptrue p2.h +; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #2, mul vl] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: ld1b { z1.h }, p2/z, [x0] +; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: st1b { z0.h }, p2, [x1] +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: st1b { z1.s }, p1, [x1, #2, mul vl] +; CHECK-NEXT: st1b { z2.d }, p0, [x1, #6, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 14 x i8>, ptr %a + store <vscale x 14 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv15i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv15i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 15 x i8>, ptr %a + store <vscale x 15 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv16i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 16 x i8>, ptr %a + store <vscale x 16 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv17i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv17i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #17 // =0x11 +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 17 x i8>, ptr %a + store <vscale x 17 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv18i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv18i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z1.s, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpklo z2.s, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s +; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpkhi z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s +; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1b { z0.d }, p0, [x1, x8] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 18 x i8>, ptr %a + store <vscale x 18 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv19i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv19i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #19 // =0x13 +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 19 x i8>, ptr %a + store <vscale x 19 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv20i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv20i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 20 x i8>, ptr %a + store <vscale x 20 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv21i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv21i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #21 // =0x15 +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 21 x i8>, ptr %a + store <vscale x 21 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv22i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv22i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: cntw x8, all, mul #5 +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: ld1b { z1.d }, p1/z, [x0, x8] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h +; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b +; CHECK-NEXT: uunpkhi z1.h, z1.b +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1b { z1.d }, p1, [x1, x8] +; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl] +; CHECK-NEXT: str z2, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 22 x i8>, ptr %a + store <vscale x 22 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv23i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv23i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #23 // =0x17 +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 23 x i8>, ptr %a + store <vscale x 23 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv24i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv24i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: st1b { z0.h }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 24 x i8>, ptr %a + store <vscale x 24 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv25i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv25i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #25 // =0x19 +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 25 x i8>, ptr %a + store <vscale x 25 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv26i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv26i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: cnth x8, all, mul #3 +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8] +; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1b { z1.d }, p0, [x1, x8] +; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl] +; CHECK-NEXT: str z2, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 26 x i8>, ptr %a + store <vscale x 26 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv27i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv27i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #27 // =0x1b +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 27 x i8>, ptr %a + store <vscale x 27 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv28i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv28i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: ptrue p1.h +; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl] +; CHECK-NEXT: str z2, [x1] +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl] +; CHECK-NEXT: st1b { z1.s }, p0, [x1, #6, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 28 x i8>, ptr %a + store <vscale x 28 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv29i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv29i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #29 // =0x1d +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 29 x i8>, ptr %a + store <vscale x 29 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv30i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv30i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: cntw x8, all, mul #7 +; CHECK-NEXT: ldr z3, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8] +; CHECK-NEXT: ptrue p2.h +; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #6, mul vl] +; CHECK-NEXT: ld1b { z2.h }, p2/z, [x0, #2, mul vl] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uzp1 z0.b, z2.b, z0.b +; CHECK-NEXT: uunpkhi z1.h, z0.b +; CHECK-NEXT: uunpklo z0.h, z0.b +; CHECK-NEXT: uunpkhi z2.s, z1.h +; CHECK-NEXT: uunpklo z1.s, z1.h +; CHECK-NEXT: uunpklo z2.d, z2.s +; CHECK-NEXT: st1b { z2.d }, p0, [x1, x8] +; CHECK-NEXT: st1b { z0.h }, p2, [x1, #2, mul vl] +; CHECK-NEXT: st1b { z1.s }, p1, [x1, #6, mul vl] +; CHECK-NEXT: str z3, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 30 x i8>, ptr %a + store <vscale x 30 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv31i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv31i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w10, #31 // =0x1f +; CHECK-NEXT: lsr x9, x8, #4 +; CHECK-NEXT: mul x9, x9, x10 +; CHECK-NEXT: whilelo p0.b, x8, x9 +; CHECK-NEXT: whilelo p1.b, xzr, x9 +; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0] +; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl] +; CHECK-NEXT: st1b { z1.b }, p1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 31 x i8>, ptr %a + store <vscale x 31 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv32i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv32i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 32 x i8>, ptr %a + store <vscale x 32 x i8> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i16>, ptr %a + store <vscale x 1 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i16>, ptr %a + store <vscale x 2 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i16>, ptr %a + store <vscale x 3 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i16>, ptr %a + store <vscale x 4 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv5i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv5i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 5 x i16>, ptr %a + store <vscale x 5 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv6i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv6i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1h { z0.s }, p1, [x1] +; CHECK-NEXT: st1h { z1.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 6 x i16>, ptr %a + store <vscale x 6 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv7i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 7 x i16>, ptr %a + store <vscale x 7 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv8i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 8 x i16>, ptr %a + store <vscale x 8 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv9i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv9i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 9 x i16>, ptr %a + store <vscale x 9 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv10i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv10i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpkhi z1.d, z1.s +; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s +; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h +; CHECK-NEXT: uunpkhi z1.s, z1.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [x1, #4, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 10 x i16>, ptr %a + store <vscale x 10 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv11i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv11i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 11 x i16>, ptr %a + store <vscale x 11 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv12i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv12i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: st1h { z0.s }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 12 x i16>, ptr %a + store <vscale x 12 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv13i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv13i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 13 x i16>, ptr %a + store <vscale x 13 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv14i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv14i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl] +; CHECK-NEXT: str z2, [x1] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1h { z0.s }, p1, [x1, #2, mul vl] +; CHECK-NEXT: st1h { z1.d }, p0, [x1, #6, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 14 x i16>, ptr %a + store <vscale x 14 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv15i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv15i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 15 x i16>, ptr %a + store <vscale x 15 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv16i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv16i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 16 x i16>, ptr %a + store <vscale x 16 x i16> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i32>, ptr %a + store <vscale x 1 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i32>, ptr %a + store <vscale x 2 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i32>, ptr %a + store <vscale x 3 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i32>, ptr %a + store <vscale x 4 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv5i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv5i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 5 x i32>, ptr %a + store <vscale x 5 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv6i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv6i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1w { z0.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 6 x i32>, ptr %a + store <vscale x 6 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv7i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv7i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 7 x i32>, ptr %a + store <vscale x 7 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv8i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv8i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 8 x i32>, ptr %a + store <vscale x 8 x i32> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1i64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1i64: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i64>, ptr %a + store <vscale x 1 x i64> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2i64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i64>, ptr %a + store <vscale x 2 x i64> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3i64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3i64: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i64>, ptr %a + store <vscale x 3 x i64> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4i64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4i64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i64>, ptr %a + store <vscale x 4 x i64> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x half>, ptr %a + store <vscale x 1 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x half>, ptr %a + store <vscale x 2 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x half>, ptr %a + store <vscale x 3 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x half>, ptr %a + store <vscale x 4 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv5f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv5f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 5 x half>, ptr %a + store <vscale x 5 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv6f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv6f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1h { z1.s }, p1, [x1] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 6 x half>, ptr %a + store <vscale x 6 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv7f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv7f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 7 x half>, ptr %a + store <vscale x 7 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv8f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv8f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 8 x half>, ptr %a + store <vscale x 8 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv9f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv9f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 9 x half>, ptr %a + store <vscale x 9 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv10f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv10f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 10 x half>, ptr %a + store <vscale x 10 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv11f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv11f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 11 x half>, ptr %a + store <vscale x 11 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv12f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv12f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 12 x half>, ptr %a + store <vscale x 12 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv13f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv13f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 13 x half>, ptr %a + store <vscale x 13 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv14f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv14f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl] +; CHECK-NEXT: str z2, [x1] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 14 x half>, ptr %a + store <vscale x 14 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv15f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv15f16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 15 x half>, ptr %a + store <vscale x 15 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv16f16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv16f16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 16 x half>, ptr %a + store <vscale x 16 x half> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1f32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x float>, ptr %a + store <vscale x 1 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x float>, ptr %a + store <vscale x 2 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3f32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x float>, ptr %a + store <vscale x 3 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x float>, ptr %a + store <vscale x 4 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv5f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv5f32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 5 x float>, ptr %a + store <vscale x 5 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv6f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv6f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: st1w { z1.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 6 x float>, ptr %a + store <vscale x 6 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv7f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv7f32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 7 x float>, ptr %a + store <vscale x 7 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv8f32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv8f32: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 8 x float>, ptr %a + store <vscale x 8 x float> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1f64: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x double>, ptr %a + store <vscale x 1 x double> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x double>, ptr %a + store <vscale x 2 x double> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3f64: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x double>, ptr %a + store <vscale x 3 x double> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4f64(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4f64: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x double>, ptr %a + store <vscale x 4 x double> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv1bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv1bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 1 x bfloat>, ptr %a + store <vscale x 1 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv2bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv2bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.d }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 2 x bfloat>, ptr %a + store <vscale x 2 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv3bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv3bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 3 x bfloat>, ptr %a + store <vscale x 3 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv4bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv4bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.s }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 4 x bfloat>, ptr %a + store <vscale x 4 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv5bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv5bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 5 x bfloat>, ptr %a + store <vscale x 5 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv6bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv6bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1h { z1.s }, p1, [x1] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 6 x bfloat>, ptr %a + store <vscale x 6 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv7bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv7bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 7 x bfloat>, ptr %a + store <vscale x 7 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv8bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv8bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 8 x bfloat>, ptr %a + store <vscale x 8 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv9bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv9bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 9 x bfloat>, ptr %a + store <vscale x 9 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv10bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv10bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 10 x bfloat>, ptr %a + store <vscale x 10 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv11bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv11bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 11 x bfloat>, ptr %a + store <vscale x 11 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv12bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv12bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ldr z0, [x0] +; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl] +; CHECK-NEXT: str z0, [x1] +; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 12 x bfloat>, ptr %a + store <vscale x 12 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv13bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv13bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 13 x bfloat>, ptr %a + store <vscale x 13 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv14bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv14bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ldr z2, [x0] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl] +; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl] +; CHECK-NEXT: str z2, [x1] +; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s +; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl] +; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h +; CHECK-NEXT: uunpkhi z0.s, z0.h +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 14 x bfloat>, ptr %a + store <vscale x 14 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv15bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv15bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 15 x bfloat>, ptr %a + store <vscale x 15 x bfloat> %c, ptr %b + ret void +} + +define void @sve_load_store_nxv16bf16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_load_store_nxv16bf16: +; CHECK: // %bb.0: +; CHECK-NEXT: ldr z0, [x0, #1, mul vl] +; CHECK-NEXT: ldr z1, [x0] +; CHECK-NEXT: str z0, [x1, #1, mul vl] +; CHECK-NEXT: str z1, [x1] +; CHECK-NEXT: ret + %c = load <vscale x 16 x bfloat>, ptr %a + store <vscale x 16 x bfloat> %c, ptr %b + ret void +} + +define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i8>, ptr %a + %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16> + ret <vscale x 1 x i16> %c.sext +} + +define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i8>, ptr %a + %c.sext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16> + ret <vscale x 2 x i16> %c.sext +} + +define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i8>, ptr %a + %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16> + ret <vscale x 3 x i16> %c.sext +} + +define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i8>, ptr %a + %c.sext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16> + ret <vscale x 4 x i16> %c.sext +} + +define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv5i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 5 x i8>, ptr %a + %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16> + ret <vscale x 5 x i16> %c.sext +} + +define <vscale x 6 x i16> @sve_sextload_nxv6i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv6i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 6 x i8>, ptr %a + %c.sext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16> + ret <vscale x 6 x i16> %c.sext +} + +define <vscale x 7 x i16> @sve_sextload_nxv7i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 7 x i8>, ptr %a + %c.sext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16> + ret <vscale x 7 x i16> %c.sext +} + +define <vscale x 8 x i16> @sve_sextload_nxv8i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 8 x i8>, ptr %a + %c.sext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16> + ret <vscale x 8 x i16> %c.sext +} + +define <vscale x 9 x i16> @sve_sextload_nxv9i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 9 x i8>, ptr %a + %c.sext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16> + ret <vscale x 9 x i16> %c.sext +} + +define <vscale x 10 x i16> @sve_sextload_nxv10i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv10i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #5 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 10 x i8>, ptr %a + %c.sext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16> + ret <vscale x 10 x i16> %c.sext +} + +define <vscale x 11 x i16> @sve_sextload_nxv11i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv11i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 11 x i8>, ptr %a + %c.sext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16> + ret <vscale x 11 x i16> %c.sext +} + +define <vscale x 12 x i16> @sve_sextload_nxv12i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv12i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntw x8, all, mul #3 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 12 x i8>, ptr %a + %c.sext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16> + ret <vscale x 12 x i16> %c.sext +} + +define <vscale x 13 x i16> @sve_sextload_nxv13i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv13i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 13 x i8>, ptr %a + %c.sext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16> + ret <vscale x 13 x i16> %c.sext +} + +define <vscale x 14 x i16> @sve_sextload_nxv14i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv14i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #7 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 14 x i8>, ptr %a + %c.sext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16> + ret <vscale x 14 x i16> %c.sext +} + +define <vscale x 15 x i16> @sve_sextload_nxv15i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv15i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 15 x i8>, ptr %a + %c.sext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16> + ret <vscale x 15 x i16> %c.sext +} + +define <vscale x 16 x i16> @sve_sextload_nxv16i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 16 x i8>, ptr %a + %c.sext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16> + ret <vscale x 16 x i16> %c.sext +} + +define <vscale x 1 x i32> @sve_sextload_nxv1i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i16>, ptr %a + %c.sext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32> + ret <vscale x 1 x i32> %c.sext +} + +define <vscale x 2 x i32> @sve_sextload_nxv2i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i16>, ptr %a + %c.sext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32> + ret <vscale x 2 x i32> %c.sext +} + +define <vscale x 3 x i32> @sve_sextload_nxv3i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i16>, ptr %a + %c.sext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32> + ret <vscale x 3 x i32> %c.sext +} + +define <vscale x 4 x i32> @sve_sextload_nxv4i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i16>, ptr %a + %c.sext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32> + ret <vscale x 4 x i32> %c.sext +} + +define <vscale x 5 x i32> @sve_sextload_nxv5i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv5i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 5 x i16>, ptr %a + %c.sext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32> + ret <vscale x 5 x i32> %c.sext +} + +define <vscale x 6 x i32> @sve_sextload_nxv6i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv6i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 6 x i16>, ptr %a + %c.sext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32> + ret <vscale x 6 x i32> %c.sext +} + +define <vscale x 7 x i32> @sve_sextload_nxv7i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 7 x i16>, ptr %a + %c.sext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32> + ret <vscale x 7 x i32> %c.sext +} + +define <vscale x 8 x i32> @sve_sextload_nxv8i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 8 x i16>, ptr %a + %c.sext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32> + ret <vscale x 8 x i32> %c.sext +} + +define <vscale x 1 x i64> @sve_sextload_nxv1i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i32>, ptr %a + %c.sext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64> + ret <vscale x 1 x i64> %c.sext +} + +define <vscale x 2 x i64> @sve_sextload_nxv2i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i32>, ptr %a + %c.sext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64> + ret <vscale x 2 x i64> %c.sext +} + +define <vscale x 3 x i64> @sve_sextload_nxv3i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 3 x i32>, ptr %a + %c.sext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64> + ret <vscale x 3 x i64> %c.sext +} + +define <vscale x 4 x i64> @sve_sextload_nxv4i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_sextload_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i32>, ptr %a + %c.sext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64> + ret <vscale x 4 x i64> %c.sext +} + +define <vscale x 1 x i16> @sve_zextload_nxv1i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv1i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i8>, ptr %a + %c.zext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16> + ret <vscale x 1 x i16> %c.zext +} + +define <vscale x 2 x i16> @sve_zextload_nxv2i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv2i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i8>, ptr %a + %c.zext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16> + ret <vscale x 2 x i16> %c.zext +} + +define <vscale x 3 x i16> @sve_zextload_nxv3i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv3i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i8>, ptr %a + %c.zext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16> + ret <vscale x 3 x i16> %c.zext +} + +define <vscale x 4 x i16> @sve_zextload_nxv4i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv4i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i8>, ptr %a + %c.zext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16> + ret <vscale x 4 x i16> %c.zext +} + +define <vscale x 5 x i16> @sve_zextload_nxv5i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv5i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 5 x i8>, ptr %a + %c.zext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16> + ret <vscale x 5 x i16> %c.zext +} + +define <vscale x 6 x i16> @sve_zextload_nxv6i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv6i8: +; CHECK: // %bb.0: +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 6 x i8>, ptr %a + %c.zext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16> + ret <vscale x 6 x i16> %c.zext +} + +define <vscale x 7 x i16> @sve_zextload_nxv7i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv7i8: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 7 x i8>, ptr %a + %c.zext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16> + ret <vscale x 7 x i16> %c.zext +} + +define <vscale x 8 x i16> @sve_zextload_nxv8i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv8i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 8 x i8>, ptr %a + %c.zext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16> + ret <vscale x 8 x i16> %c.zext +} + +define <vscale x 9 x i16> @sve_zextload_nxv9i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv9i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #9 // =0x9 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 9 x i8>, ptr %a + %c.zext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16> + ret <vscale x 9 x i16> %c.zext +} + +define <vscale x 10 x i16> @sve_zextload_nxv10i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv10i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #5 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 10 x i8>, ptr %a + %c.zext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16> + ret <vscale x 10 x i16> %c.zext +} + +define <vscale x 11 x i16> @sve_zextload_nxv11i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv11i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #11 // =0xb +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 11 x i8>, ptr %a + %c.zext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16> + ret <vscale x 11 x i16> %c.zext +} + +define <vscale x 12 x i16> @sve_zextload_nxv12i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv12i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntw x8, all, mul #3 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 12 x i8>, ptr %a + %c.zext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16> + ret <vscale x 12 x i16> %c.zext +} + +define <vscale x 13 x i16> @sve_zextload_nxv13i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv13i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #13 // =0xd +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 13 x i8>, ptr %a + %c.zext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16> + ret <vscale x 13 x i16> %c.zext +} + +define <vscale x 14 x i16> @sve_zextload_nxv14i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv14i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #7 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.s +; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0] +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: uunpkhi z1.s, z0.h +; CHECK-NEXT: uunpklo z0.s, z0.h +; CHECK-NEXT: str z2, [sp] +; CHECK-NEXT: uunpklo z1.d, z1.s +; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 14 x i8>, ptr %a + %c.zext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16> + ret <vscale x 14 x i16> %c.zext +} + +define <vscale x 15 x i16> @sve_zextload_nxv15i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv15i8: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #15 // =0xf +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.b, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0] +; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1h { z1.h }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 15 x i8>, ptr %a + %c.zext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16> + ret <vscale x 15 x i16> %c.zext +} + +define <vscale x 16 x i16> @sve_zextload_nxv16i8(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv16i8: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.h +; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0] +; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 16 x i8>, ptr %a + %c.zext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16> + ret <vscale x 16 x i16> %c.zext +} + +define <vscale x 1 x i32> @sve_zextload_nxv1i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv1i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i16>, ptr %a + %c.zext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32> + ret <vscale x 1 x i32> %c.zext +} + +define <vscale x 2 x i32> @sve_zextload_nxv2i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv2i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i16>, ptr %a + %c.zext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32> + ret <vscale x 2 x i32> %c.zext +} + +define <vscale x 3 x i32> @sve_zextload_nxv3i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv3i16: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 3 x i16>, ptr %a + %c.zext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32> + ret <vscale x 3 x i32> %c.zext +} + +define <vscale x 4 x i32> @sve_zextload_nxv4i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv4i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i16>, ptr %a + %c.zext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32> + ret <vscale x 4 x i32> %c.zext +} + +define <vscale x 5 x i32> @sve_zextload_nxv5i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv5i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #5 // =0x5 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 5 x i16>, ptr %a + %c.zext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32> + ret <vscale x 5 x i32> %c.zext +} + +define <vscale x 6 x i32> @sve_zextload_nxv6i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv6i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: cntd x8, all, mul #3 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ptrue p1.d +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: uunpklo z0.d, z0.s +; CHECK-NEXT: str z1, [sp] +; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 6 x i16>, ptr %a + %c.zext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32> + ret <vscale x 6 x i32> %c.zext +} + +define <vscale x 7 x i32> @sve_zextload_nxv7i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv7i16: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #7 // =0x7 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.h, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0] +; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1w { z1.s }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 7 x i16>, ptr %a + %c.zext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32> + ret <vscale x 7 x i32> %c.zext +} + +define <vscale x 8 x i32> @sve_zextload_nxv8i16(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv8i16: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.s +; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0] +; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 8 x i16>, ptr %a + %c.zext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32> + ret <vscale x 8 x i32> %c.zext +} + +define <vscale x 1 x i64> @sve_zextload_nxv1i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv1i32: +; CHECK: // %bb.0: +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: whilelo p0.d, xzr, x8 +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 1 x i32>, ptr %a + %c.zext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64> + ret <vscale x 1 x i64> %c.zext +} + +define <vscale x 2 x i64> @sve_zextload_nxv2i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv2i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ret + %c = load <vscale x 2 x i32>, ptr %a + %c.zext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64> + ret <vscale x 2 x i64> %c.zext +} + +define <vscale x 3 x i64> @sve_zextload_nxv3i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv3i32: +; CHECK: // %bb.0: +; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill +; CHECK-NEXT: addvl sp, sp, #-2 +; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG +; CHECK-NEXT: .cfi_offset w29, -16 +; CHECK-NEXT: rdvl x8, #1 +; CHECK-NEXT: mov w9, #3 // =0x3 +; CHECK-NEXT: lsr x8, x8, #4 +; CHECK-NEXT: mul x8, x8, x9 +; CHECK-NEXT: whilelo p0.s, xzr, x8 +; CHECK-NEXT: punpkhi p1.h, p0.b +; CHECK-NEXT: punpklo p0.h, p0.b +; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0] +; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl] +; CHECK-NEXT: st1d { z1.d }, p0, [sp] +; CHECK-NEXT: ldr z1, [sp, #1, mul vl] +; CHECK-NEXT: ldr z0, [sp] +; CHECK-NEXT: addvl sp, sp, #2 +; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload +; CHECK-NEXT: ret + %c = load <vscale x 3 x i32>, ptr %a + %c.zext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64> + ret <vscale x 3 x i64> %c.zext +} + +define <vscale x 4 x i64> @sve_zextload_nxv4i32(ptr %a, ptr %b) { +; CHECK-LABEL: sve_zextload_nxv4i32: +; CHECK: // %bb.0: +; CHECK-NEXT: ptrue p0.d +; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0] +; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl] +; CHECK-NEXT: ret + %c = load <vscale x 4 x i32>, ptr %a + %c.zext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64> + ret <vscale x 4 x i64> %c.zext +} diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll index 2cbb29e..d8de12c 100644 --- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll +++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll @@ -672,5 +672,3 @@ entry: ret i32 %x } declare void @other() -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; CHECK-FRAMELAYOUT: {{.*}} diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir index ca77482..fa52b96 100644 --- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir +++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir @@ -1,19 +1,9 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 # RUN: llc -mtriple=amdgcn -run-pass register-coalescer -o - %s | FileCheck %s -# Check that coalescer does not create wider register tuple than in source - -# CHECK: - { id: 2, class: vreg_64, preferred-register: '', flags: [ ] } -# CHECK: - { id: 3, class: vreg_64, preferred-register: '', flags: [ ] } -# CHECK: - { id: 4, class: vreg_64, preferred-register: '', flags: [ ] } -# CHECK: - { id: 5, class: vreg_96, preferred-register: '', flags: [ ] } -# CHECK: - { id: 6, class: vreg_96, preferred-register: '', flags: [ ] } -# CHECK: - { id: 7, class: vreg_128, preferred-register: '', flags: [ ] } -# CHECK: - { id: 8, class: vreg_128, preferred-register: '', flags: [ ] } +# Check that coalescer does not create wider register tuple than in +# source. # No more registers shall be defined -# CHECK-NEXT: liveins: -# CHECK: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %4, -# CHECK: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, %6, - --- name: main alignment: 1 @@ -52,6 +42,23 @@ body: | bb.0.entry: liveins: $sgpr0, $vgpr0_vgpr1 + ; CHECK-LABEL: name: main + ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0 + ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0 + ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0 + ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0 + ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr + ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF + ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0 + ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr %3 = IMPLICIT_DEF undef %4.sub0 = COPY $sgpr0 %4.sub1 = COPY %3.sub0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll index de7d234..b9bf76c 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s declare i32 @llvm.amdgcn.s.quadmask.i32(i32) declare i64 @llvm.amdgcn.s.quadmask.i64(i64) @@ -172,3 +172,91 @@ entry: %qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask) ret i64 %qm } + +;; Ensure that AND/ICMP cannot be fused into an AND because s_quadmask_b32 implicitly defines SCC. +define amdgpu_kernel void @test_scc_quadmask_32(i32 %val0, i32 %val1, ptr addrspace(1) %ptr) { +; GFX11-GISEL-LABEL: test_scc_quadmask_32: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 1 +; GFX11-GISEL-NEXT: s_quadmask_b32 s1, s1 +; GFX11-GISEL-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0 +; GFX11-GISEL-NEXT: global_store_b32 v2, v3, s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_scc_quadmask_32: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 1 +; GFX11-SDAG-NEXT: s_quadmask_b32 s1, s1 +; GFX11-SDAG-NEXT: s_cmp_eq_u32 s0, 0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0 +; GFX11-SDAG-NEXT: global_store_b32 v2, v3, s[2:3] +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off +; GFX11-SDAG-NEXT: s_endpgm + %and = and i32 %val0, 1 + %result = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val1) nounwind readnone + store i32 %result, ptr addrspace(1) %ptr + %cmp = icmp eq i32 %and, 0 + %sel = select i1 %cmp, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) null, align 4 + ret void +} + +;; Ensure that AND/ICMP cannot be fused into an AND because s_quadmask_b64 implicitly defines SCC. +define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrspace(1) %ptr) { +; GFX11-GISEL-LABEL: test_scc_quadmask_64: +; GFX11-GISEL: ; %bb.0: +; GFX11-GISEL-NEXT: s_clause 0x1 +; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x24 +; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-GISEL-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; GFX11-GISEL-NEXT: s_and_b32 s4, s4, 1 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0 +; GFX11-GISEL-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1 +; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0 +; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0 +; GFX11-GISEL-NEXT: global_store_b64 v4, v[0:1], s[2:3] +; GFX11-GISEL-NEXT: global_store_b32 v[2:3], v5, off +; GFX11-GISEL-NEXT: s_endpgm +; +; GFX11-SDAG-LABEL: test_scc_quadmask_64: +; GFX11-SDAG: ; %bb.0: +; GFX11-SDAG-NEXT: s_clause 0x1 +; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x24 +; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c +; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_and_b32 s4, s6, 1 +; GFX11-SDAG-NEXT: s_quadmask_b64 s[0:1], s[0:1] +; GFX11-SDAG-NEXT: s_cmp_eq_u32 s4, 0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0 +; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0 +; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3] +; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v5, off +; GFX11-SDAG-NEXT: s_endpgm + %and = and i32 %val0, 1 + %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val1) nounwind readnone + store i64 %result, ptr addrspace(1) %ptr + %cmp = icmp eq i32 %and, 0 + %sel = select i1 %cmp, i32 1, i32 0 + store i32 %sel, ptr addrspace(1) null, align 4 + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll index 0de7f8f..bd29e9e 100644 --- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll +++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals -; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s +; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s ; Regression test for issue 160181 ; One variable is chosen to be assigned at zero. Here, that's @both @@ -22,12 +22,20 @@ ;. ; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META0:![0-9]+]] ; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata" +; CHECK: @llvm.amdgcn.kernel.kern_one.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_one.lds.t poison, align 4, !absolute_symbol [[META1:![0-9]+]] +; CHECK: @llvm.amdgcn.kernel.kern_two.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_two.lds.t poison, align 4, !absolute_symbol [[META1]] +; CHECK: @llvm.amdgcn.kernel.kern_block_direct_allocation.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_block_direct_allocation.lds.t poison, align 4, !absolute_symbol [[META1]] + ;. define void @func_one() { ; CHECK-LABEL: define {{[^@]+}}@func_one() { -; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1:![0-9]+]] -; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18:![0-9]+]] -; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23:![0-9]+]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2:![0-9]+]] +; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4 +; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[ONE1]], align 4 +; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11:![0-9]+]] ; CHECK-NEXT: ret void ; %val0 = load i32, ptr addrspace(3) @both @@ -38,9 +46,10 @@ define void @func_one() { define amdgpu_kernel void @kern_one() { ; CHECK-LABEL: define {{[^@]+}}@kern_one -; CHECK-SAME: () #[[ATTR0:[0-9]+]] { +; CHECK-SAME: () #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META16:![0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META24:![0-9]+]] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_one.lds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META17:![0-9]+]] ; CHECK-NEXT: call void @func_one() ; CHECK-NEXT: ret void ; @@ -51,9 +60,13 @@ entry: define void @func_two() { ; CHECK-LABEL: define {{[^@]+}}@func_two() { -; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]] -; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25:![0-9]+]] -; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]] +; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4 +; CHECK-NEXT: [[TWO1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[TWO1]], align 4 +; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]] ; CHECK-NEXT: ret void ; %val0 = load i32, ptr addrspace(3) @both @@ -64,9 +77,10 @@ define void @func_two() { define amdgpu_kernel void @kern_two() { ; CHECK-LABEL: define {{[^@]+}}@kern_two -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META18:![0-9]+]] { ; CHECK-NEXT: entry: -; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META26:![0-9]+]], !noalias [[META27:![0-9]+]] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_two.lds) ] +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]] ; CHECK-NEXT: call void @func_two() ; CHECK-NEXT: ret void ; @@ -82,11 +96,18 @@ entry: ; remains the best candidate for address zero allocation. define void @func_block_direct_allocation() { ; CHECK-LABEL: define {{[^@]+}}@func_block_direct_allocation() { -; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18]] -; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25]] +; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id() +; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4 +; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3) +; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) [[ONE1]], align 4 +; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1 +; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4 +; CHECK-NEXT: [[TWO2:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3) +; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) [[TWO2]], align 4 ; CHECK-NEXT: [[SUM:%.*]] = add i32 [[VAL1]], [[VAL2]] -; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]] -; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]] +; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]] +; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]] ; CHECK-NEXT: ret void ; %val1 = load i32, ptr addrspace(3) @one @@ -99,7 +120,8 @@ define void @func_block_direct_allocation() { define amdgpu_kernel void @kern_block_direct_allocation() { ; CHECK-LABEL: define {{[^@]+}}@kern_block_direct_allocation -; CHECK-SAME: () #[[ATTR0]] { +; CHECK-SAME: () #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META21:![0-9]+]] { +; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_block_direct_allocation.lds) ], !alias.scope [[META22:![0-9]+]], !noalias [[META25:![0-9]+]] ; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ] ; CHECK-NEXT: call void @func_block_direct_allocation() ; CHECK-NEXT: call void @func_one() @@ -112,35 +134,8 @@ define amdgpu_kernel void @kern_block_direct_allocation() { ret void } ;. -; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="16" } -; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } -;. -; CHECK: [[META0]] = !{i32 0, i32 1} -; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]], [[META14:![0-9]+]], [[META16:![0-9]+]], [[META17:![0-9]+]]} -; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]]} -; CHECK: [[META3]] = distinct !{[[META3]]} -; CHECK: [[META4]] = distinct !{[[META4]], [[META3]]} -; CHECK: [[META5]] = distinct !{[[META5]], [[META3]]} -; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]} -; CHECK: [[META7]] = distinct !{[[META7]]} -; CHECK: [[META8]] = distinct !{[[META8]], [[META7]]} -; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]} -; CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]} -; CHECK: [[META11]] = distinct !{[[META11]]} -; CHECK: [[META12]] = distinct !{[[META12]], [[META11]]} -; CHECK: [[META13]] = distinct !{[[META13]], [[META11]]} -; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]} -; CHECK: [[META15]] = distinct !{[[META15]]} -; CHECK: [[META16]] = distinct !{[[META16]], [[META15]]} -; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]} -; CHECK: [[META18]] = !{[[META19:![0-9]+]], [[META2]], [[META5]], [[META20:![0-9]+]], [[META6]], [[META9]], [[META21:![0-9]+]], [[META10]], [[META13]], [[META22:![0-9]+]], [[META14]], [[META17]]} -; CHECK: [[META19]] = distinct !{[[META19]], [[META3]]} -; CHECK: [[META20]] = distinct !{[[META20]], [[META7]]} -; CHECK: [[META21]] = distinct !{[[META21]], [[META11]]} -; CHECK: [[META22]] = distinct !{[[META22]], [[META15]]} -; CHECK: [[META23]] = !{[[META19]], [[META4]], [[META5]], [[META20]], [[META8]], [[META9]], [[META21]], [[META12]], [[META13]], [[META22]], [[META16]], [[META17]]} -; CHECK: [[META24]] = !{[[META10]], [[META12]], [[META13]], [[META14]], [[META16]], [[META17]]} -; CHECK: [[META25]] = !{[[META19]], [[META2]], [[META4]], [[META20]], [[META6]], [[META8]], [[META21]], [[META10]], [[META12]], [[META22]], [[META14]], [[META16]]} -; CHECK: [[META26]] = !{[[META22]]} -; CHECK: [[META27]] = !{[[META14]], [[META16]], [[META17]]} +; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="12" } +; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="16" } +; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) } +; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) } ;. diff --git a/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll b/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll new file mode 100644 index 0000000..267e365 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll @@ -0,0 +1,23 @@ +; RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128B < %s | FileCheck %s +; REQUIRES: asserts + +; Check that the test does not assert when unaligned vector store V6_vS32Ub_npred_ai is generated. +; CHECK: if (!p{{[0-3]}}) vmemu + +target triple = "hexagon-unknown-unknown-elf" + +define fastcc void @test(i1 %cmp.i.i) { +entry: + %call.i.i.i172 = load ptr, ptr null, align 4 + %add.ptr = getelementptr i8, ptr %call.i.i.i172, i32 1 + store <32 x i32> zeroinitializer, ptr %add.ptr, align 128 + %add.ptr4.i4 = getelementptr i8, ptr %call.i.i.i172, i32 129 + br i1 %cmp.i.i, label %common.ret, label %if.end.i.i + +common.ret: ; preds = %if.end.i.i, %entry + ret void + +if.end.i.i: ; preds = %entry + store <32 x i32> zeroinitializer, ptr %add.ptr4.i4, align 1 + br label %common.ret +} diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll index 18fb879..21ca041 100644 --- a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll +++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll @@ -115,5 +115,150 @@ define ptx_kernel void @inlineasm(ptr %p) { store <2 x float> %mul, ptr %p, align 8 ret void } + +define ptx_kernel void @trunc_v2i32(<2 x i32> %0) { +; CHECK-SM90A-LABEL: trunc_v2i32( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b32 %r<7>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<2>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0]; +; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U; +; CHECK-SM90A-NEXT: mov.b32 %r4, 0; +; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U; +; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U; +; CHECK-SM90A-NEXT: mov.b64 %rd1, 0; +; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: trunc_v2i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b32 %r<7>; +; CHECK-SM100-NEXT: .reg .b64 %rd<3>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0]; +; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1; +; CHECK-SM100-NEXT: mov.b32 %r3, 0; +; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U; +; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U; +; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U; +; CHECK-SM100-NEXT: mov.b64 %rd2, 0; +; CHECK-SM100-NEXT: st.b32 [%rd2], %r6; +; CHECK-SM100-NEXT: ret; + %2 = trunc <2 x i32> %0 to <2 x i8> + %3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i8> %3, ptr null, align 4 + ret void +} + +define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) { +; CHECK-SM90A-LABEL: zextend_to_v2i32( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b16 %rs<3>; +; CHECK-SM90A-NEXT: .reg .b32 %r<4>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<5>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0]; +; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2; +; CHECK-SM90A-NEXT: mov.b64 %rd1, 12; +; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3; +; CHECK-SM90A-NEXT: mov.b64 %rd2, 8; +; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2; +; CHECK-SM90A-NEXT: mov.b64 %rd3, 4; +; CHECK-SM90A-NEXT: st.b32 [%rd3], 0; +; CHECK-SM90A-NEXT: mov.b64 %rd4, 0; +; CHECK-SM90A-NEXT: st.b32 [%rd4], 0; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: zextend_to_v2i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<3>; +; CHECK-SM100-NEXT: .reg .b32 %r<5>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0]; +; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1; +; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2}; +; CHECK-SM100-NEXT: mov.b32 %r4, 0; +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4}; +; CHECK-SM100-NEXT: mov.b64 %rd3, 4; +; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd4, 0; +; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd5, 8; +; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1; +; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32; +; CHECK-SM100-NEXT: mov.b64 %rd7, 12; +; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6; +; CHECK-SM100-NEXT: ret; + %2 = zext <2 x i8> %0 to <2 x i32> + %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i32> %3, ptr null, align 4 + ret void +} + +define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) { +; CHECK-SM90A-LABEL: sextend_to_v2i32( +; CHECK-SM90A: { +; CHECK-SM90A-NEXT: .reg .b16 %rs<3>; +; CHECK-SM90A-NEXT: .reg .b32 %r<6>; +; CHECK-SM90A-NEXT: .reg .b64 %rd<5>; +; CHECK-SM90A-EMPTY: +; CHECK-SM90A-NEXT: // %bb.0: +; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0]; +; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1; +; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2; +; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2; +; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4; +; CHECK-SM90A-NEXT: mov.b64 %rd1, 12; +; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5; +; CHECK-SM90A-NEXT: mov.b64 %rd2, 8; +; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3; +; CHECK-SM90A-NEXT: mov.b64 %rd3, 4; +; CHECK-SM90A-NEXT: st.b32 [%rd3], 0; +; CHECK-SM90A-NEXT: mov.b64 %rd4, 0; +; CHECK-SM90A-NEXT: st.b32 [%rd4], 0; +; CHECK-SM90A-NEXT: ret; +; +; CHECK-SM100-LABEL: sextend_to_v2i32( +; CHECK-SM100: { +; CHECK-SM100-NEXT: .reg .b16 %rs<3>; +; CHECK-SM100-NEXT: .reg .b32 %r<7>; +; CHECK-SM100-NEXT: .reg .b64 %rd<8>; +; CHECK-SM100-EMPTY: +; CHECK-SM100-NEXT: // %bb.0: +; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0]; +; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2}; +; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2; +; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2; +; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1; +; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4; +; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3}; +; CHECK-SM100-NEXT: mov.b32 %r6, 0; +; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6}; +; CHECK-SM100-NEXT: mov.b64 %rd3, 4; +; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd4, 0; +; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2; +; CHECK-SM100-NEXT: mov.b64 %rd5, 8; +; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1; +; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32; +; CHECK-SM100-NEXT: mov.b64 %rd7, 12; +; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6; +; CHECK-SM100-NEXT: ret; + %2 = sext <2 x i8> %0 to <2 x i32> + %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3> + store <4 x i32> %3, ptr null, align 4 + ret void +} ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: ; CHECK: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll index 06d54fa..95bff27 100644 --- a/llvm/test/CodeGen/RISCV/rvv/remat.ll +++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll @@ -301,3 +301,135 @@ define void @vfmv.s.f(ptr %p, double %x) { store volatile double %x, ptr %p ret void } + +; This test is fairly fragile, but it's trying to cover the case which +; caused the revert of bba9172 due to interaction with how rematerialize +; instructions are pruned from the original live interval. In the result +; below, we remat the vmv.v.x into the loop, but fail to remat the vmv.v.x +; a second time after further splitting it's live range. We shouldn't need +; to spill it to the stack at all. +define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, ptr %p) #0 { +; CHECK-LABEL: dual_remat: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: addi sp, sp, -16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: sub sp, sp, a1 +; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a1, a1, 3 +; CHECK-NEXT: add a1, sp, a1 +; CHECK-NEXT: addi a1, a1, 16 +; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: addi a1, sp, 16 +; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma +; CHECK-NEXT: vmv.v.i v16, 0 +; CHECK-NEXT: csrr a2, vlenb +; CHECK-NEXT: srli a1, a2, 3 +; CHECK-NEXT: slli a2, a2, 3 +; CHECK-NEXT: add a2, a3, a2 +; CHECK-NEXT: vmv.v.i v0, 0 +; CHECK-NEXT: .LBB8_1: # %vector.body +; CHECK-NEXT: # =>This Inner Loop Header: Depth=1 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: vmv.v.x v8, a0 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a5, a4, 4 +; CHECK-NEXT: add a4, a5, a4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vand.vv v16, v16, v8 +; CHECK-NEXT: vmsne.vi v24, v16, 0 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vs1r.v v24, (a4) # vscale x 8-byte Folded Spill +; CHECK-NEXT: vand.vv v16, v0, v8 +; CHECK-NEXT: vmsne.vi v8, v16, 0 +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: mv a5, a4 +; CHECK-NEXT: slli a4, a4, 3 +; CHECK-NEXT: add a5, a5, a4 +; CHECK-NEXT: slli a4, a4, 1 +; CHECK-NEXT: add a4, a4, a5 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload +; CHECK-NEXT: csrr a4, vlenb +; CHECK-NEXT: slli a4, a4, 4 +; CHECK-NEXT: add a4, sp, a4 +; CHECK-NEXT: addi a4, a4, 16 +; CHECK-NEXT: vl1r.v v9, (a4) # vscale x 8-byte Folded Reload +; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma +; CHECK-NEXT: vslideup.vx v9, v8, a1 +; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma +; CHECK-NEXT: vcpop.m a4, v9 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a6, a5, 4 +; CHECK-NEXT: add a5, a6, a5 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vs8r.v v8, (a3) +; CHECK-NEXT: vs8r.v v8, (a2) +; CHECK-NEXT: addi a5, sp, 16 +; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma +; CHECK-NEXT: vor.vv v16, v16, v8 +; CHECK-NEXT: csrr a5, vlenb +; CHECK-NEXT: slli a5, a5, 3 +; CHECK-NEXT: add a5, sp, a5 +; CHECK-NEXT: addi a5, a5, 16 +; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload +; CHECK-NEXT: vor.vv v0, v0, v8 +; CHECK-NEXT: beqz a4, .LBB8_1 +; CHECK-NEXT: # %bb.2: # %middle.block +; CHECK-NEXT: andi a0, a0, 1 +; CHECK-NEXT: csrr a1, vlenb +; CHECK-NEXT: slli a2, a1, 5 +; CHECK-NEXT: add a1, a2, a1 +; CHECK-NEXT: add sp, sp, a1 +; CHECK-NEXT: .cfi_def_cfa sp, 16 +; CHECK-NEXT: addi sp, sp, 16 +; CHECK-NEXT: .cfi_def_cfa_offset 0 +; CHECK-NEXT: ret +entry: + %broadcast.splatinsert = insertelement <vscale x 16 x i64> zeroinitializer, i64 %0, i64 0 + %broadcast.splat = shufflevector <vscale x 16 x i64> %broadcast.splatinsert, <vscale x 16 x i64> zeroinitializer, <vscale x 16 x i32> zeroinitializer + br label %vector.body + +vector.body: ; preds = %vector.body, %entry + %vec.ind = phi <vscale x 16 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ] + %3 = and <vscale x 16 x i64> %vec.ind, %broadcast.splat + %4 = icmp ne <vscale x 16 x i64> %3, zeroinitializer + store <vscale x 16 x i64> %broadcast.splat, ptr %p + %5 = tail call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %4) + %vec.ind.next = or <vscale x 16 x i64> %vec.ind, %1 + br i1 %5, label %middle.block, label %vector.body + +middle.block: ; preds = %vector.body + %and.i = and i64 1, %0 + ret i64 %and.i +} diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll index cd52498..2964da9 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll @@ -32,6 +32,7 @@ ; CHECK-DAG: OpDecorate [[g]] Binding 0 ; CHECK-DAG: OpDecorate [[h]] DescriptorSet 10 ; CHECK-DAG: OpDecorate [[h]] Binding 3 +; CHECK-NOT: OpDecorate [[h]] Binding 4 ; CHECK-DAG: OpDecorate [[i]] DescriptorSet 10 ; CHECK-DAG: OpDecorate [[i]] Binding 2 @@ -44,30 +45,34 @@ entry: %3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.6) %4 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 1, i32 1, i32 0, ptr nonnull @.str.8) %5 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 2, i32 10, i32 1, i32 0, ptr nonnull @.str.10) - %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 1, i32 0, ptr nonnull @.str.12) - %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14) - %8 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0) - %9 = load i32, ptr addrspace(11) %8, align 4 - %10 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0) - %11 = load i32, ptr addrspace(11) %10, align 4 - %add.i = add nsw i32 %11, %9 - %12 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0) - %13 = load i32, ptr addrspace(11) %12, align 4 - %add4.i = add nsw i32 %add.i, %13 - %14 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0) - %15 = load i32, ptr addrspace(11) %14, align 4 - %add6.i = add nsw i32 %add4.i, %15 - %16 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0) - %17 = load i32, ptr addrspace(11) %16, align 4 - %add8.i = add nsw i32 %add6.i, %17 - %18 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0) - %19 = load i32, ptr addrspace(11) %18, align 4 - %add10.i = add nsw i32 %add8.i, %19 - %20 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0) - %21 = load i32, ptr addrspace(11) %20, align 4 - %add12.i = add nsw i32 %add10.i, %21 - %22 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0) - store i32 %add12.i, ptr addrspace(11) %22, align 4 + %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 0, ptr nonnull @.str.12) + %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 1, ptr nonnull @.str.12) + %8 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14) + %9 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0) + %10 = load i32, ptr addrspace(11) %9, align 4 + %11 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0) + %12 = load i32, ptr addrspace(11) %11, align 4 + %add.i = add nsw i32 %12, %10 + %13 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0) + %14 = load i32, ptr addrspace(11) %13, align 4 + %add4.i = add nsw i32 %add.i, %14 + %15 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0) + %16 = load i32, ptr addrspace(11) %15, align 4 + %add6.i = add nsw i32 %add4.i, %16 + %17 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0) + %18 = load i32, ptr addrspace(11) %17, align 4 + %add8.i = add nsw i32 %add6.i, %18 + %19 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0) + %20 = load i32, ptr addrspace(11) %19, align 4 + %add10.i = add nsw i32 %add8.i, %20 + %21 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0) + %22 = load i32, ptr addrspace(11) %21, align 4 + %add12.i = add nsw i32 %add10.i, %22 + %23 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %8, i32 0) + %24 = load i32, ptr addrspace(11) %23, align 4 + %add14.i = add nsw i32 %add12.i, %24 + %25 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0) + store i32 %add14.i, ptr addrspace(11) %25, align 4 ret void } diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll new file mode 100644 index 0000000..c968c99 --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll @@ -0,0 +1,19 @@ +; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR +; CHECK-ERROR: LLVM ERROR: Implicit binding calls with the same order ID must have the same descriptor set + +@.str = private unnamed_addr constant [2 x i8] c"b\00", align 1 +@.str.2 = private unnamed_addr constant [2 x i8] c"c\00", align 1 + +define void @main() local_unnamed_addr #0 { +entry: + %0 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 0, i32 1, i32 0, ptr nonnull @.str) + %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0) + %2 = load i32, ptr addrspace(11) %1, align 4 + %3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 1, i32 1, i32 0, ptr nonnull @.str.2) + %4 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0) + store i32 %2, ptr addrspace(11) %4, align 4 + ret void +} + + +attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } diff --git a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll index d3d6413..eb7c1b6 100644 --- a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll +++ b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll @@ -235,7 +235,7 @@ define half @f12_half(half %dummy, half %val, ptr %dest) { ; CHECK-NEXT: blah %f0 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT -; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: ltebr %f1, %f0 ; CHECK-NEXT: jl .LBB11_2 ; CHECK-NEXT:# %bb.1: ; CHECK-NEXT: lgdr %r0, %f8 @@ -344,7 +344,7 @@ define half @f15_half(half %val, half %dummy, ptr %dest) { ; CHECK-NEXT: blah %f2 ; CHECK-NEXT: #NO_APP ; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT -; CHECK-NEXT: ltebr %f0, %f0 +; CHECK-NEXT: ltebr %f1, %f0 ; CHECK-NEXT: jl .LBB15_2 ; CHECK-NEXT:# %bb.1: ; CHECK-NEXT: lgdr %r0, %f8 diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll index 3bc0aba..93e2889 100644 --- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll +++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll @@ -7,19 +7,22 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) { ; CHECK-LABEL: udiv_by_minus_one: ; CHECK: # %bb.0: -; CHECK-NEXT: and %s0, %s0, (56)0 -; CHECK-NEXT: lea %s4, 16843010 -; CHECK-NEXT: muls.l %s0, %s0, %s4 -; CHECK-NEXT: srl %s0, %s0, 32 +; CHECK-NEXT: and %s4, %s0, (56)0 ; CHECK-NEXT: and %s1, %s1, (56)0 -; CHECK-NEXT: muls.l %s1, %s1, %s4 -; CHECK-NEXT: srl %s1, %s1, 32 ; CHECK-NEXT: and %s2, %s2, (56)0 -; CHECK-NEXT: muls.l %s2, %s2, %s4 -; CHECK-NEXT: srl %s2, %s2, 32 ; CHECK-NEXT: and %s3, %s3, (56)0 -; CHECK-NEXT: muls.l %s3, %s3, %s4 -; CHECK-NEXT: srl %s3, %s3, 32 +; CHECK-NEXT: or %s0, 0, (0)1 +; CHECK-NEXT: cmpu.w %s5, %s3, (56)0 +; CHECK-NEXT: or %s3, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s5 +; CHECK-NEXT: cmpu.w %s5, %s2, (56)0 +; CHECK-NEXT: or %s2, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s5 +; CHECK-NEXT: cmpu.w %s5, %s1, (56)0 +; CHECK-NEXT: or %s1, 0, (0)1 +; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s5 +; CHECK-NEXT: cmpu.w %s4, %s4, (56)0 +; CHECK-NEXT: cmov.w.eq %s0, (63)0, %s4 ; CHECK-NEXT: b.l.t (, %s10) %r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255> ret <4 x i8> %r @@ -28,27 +31,18 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) { define <4 x i8> @urem_by_minus_one(<4 x i8> %x) { ; CHECK-LABEL: urem_by_minus_one: ; CHECK: # %bb.0: -; CHECK-NEXT: and %s0, %s0, (56)0 -; CHECK-NEXT: and %s1, %s1, (56)0 -; CHECK-NEXT: and %s2, %s2, (56)0 -; CHECK-NEXT: and %s3, %s3, (56)0 -; CHECK-NEXT: lea %s4, 16843010 -; CHECK-NEXT: muls.l %s5, %s3, %s4 -; CHECK-NEXT: srl %s5, %s5, 32 -; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0 -; CHECK-NEXT: subs.w.sx %s3, %s3, %s5 -; CHECK-NEXT: muls.l %s5, %s2, %s4 -; CHECK-NEXT: srl %s5, %s5, 32 -; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0 -; CHECK-NEXT: subs.w.sx %s2, %s2, %s5 -; CHECK-NEXT: muls.l %s5, %s1, %s4 -; CHECK-NEXT: srl %s5, %s5, 32 -; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0 -; CHECK-NEXT: subs.w.sx %s1, %s1, %s5 -; CHECK-NEXT: muls.l %s4, %s0, %s4 -; CHECK-NEXT: srl %s4, %s4, 32 -; CHECK-NEXT: muls.w.sx %s4, %s4, (56)0 -; CHECK-NEXT: subs.w.sx %s0, %s0, %s4 +; CHECK-NEXT: and %s4, %s0, (56)0 +; CHECK-NEXT: and %s5, %s1, (56)0 +; CHECK-NEXT: and %s6, %s2, (56)0 +; CHECK-NEXT: and %s7, %s3, (56)0 +; CHECK-NEXT: cmpu.w %s7, %s7, (56)0 +; CHECK-NEXT: cmov.w.eq %s3, (0)1, %s7 +; CHECK-NEXT: cmpu.w %s6, %s6, (56)0 +; CHECK-NEXT: cmov.w.eq %s2, (0)1, %s6 +; CHECK-NEXT: cmpu.w %s5, %s5, (56)0 +; CHECK-NEXT: cmov.w.eq %s1, (0)1, %s5 +; CHECK-NEXT: cmpu.w %s4, %s4, (56)0 +; CHECK-NEXT: cmov.w.eq %s0, (0)1, %s4 ; CHECK-NEXT: b.l.t (, %s10) %r = urem <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255> ret <4 x i8> %r diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll index ec1b8a3..f998128 100644 --- a/llvm/test/CodeGen/X86/fshl.ll +++ b/llvm/test/CodeGen/X86/fshl.ll @@ -335,84 +335,83 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: pushl %esi ; X86-SLOW-NEXT: andl $-16, %esp ; X86-SLOW-NEXT: subl $32, %esp -; X86-SLOW-NEXT: movl 24(%ebp), %esi +; X86-SLOW-NEXT: movl 24(%ebp), %edi ; X86-SLOW-NEXT: movl 28(%ebp), %eax ; X86-SLOW-NEXT: movl 48(%ebp), %edx ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: testb $64, %cl -; X86-SLOW-NEXT: movl 52(%ebp), %edi +; X86-SLOW-NEXT: movl 52(%ebp), %ebx ; X86-SLOW-NEXT: jne .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: ; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %esi, %edx -; X86-SLOW-NEXT: movl 32(%ebp), %esi -; X86-SLOW-NEXT: movl %edi, %ecx -; X86-SLOW-NEXT: movl %eax, %edi +; X86-SLOW-NEXT: movl %edi, %edx +; X86-SLOW-NEXT: movl 32(%ebp), %edi +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, %ebx ; X86-SLOW-NEXT: movl 36(%ebp), %eax ; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: ; X86-SLOW-NEXT: movl 40(%ebp), %ecx ; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl 44(%ebp), %ecx +; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: .LBB6_3: -; X86-SLOW-NEXT: movl 56(%ebp), %ebx -; X86-SLOW-NEXT: testb $32, %bl +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: testb $32, %cl ; X86-SLOW-NEXT: jne .LBB6_4 ; X86-SLOW-NEXT: # %bb.5: -; X86-SLOW-NEXT: movl %ecx, %ebx ; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %edi +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_4: -; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ecx, %edx -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload +; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: shll %cl, %eax +; X86-SLOW-NEXT: shrl %esi +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: notb %dl +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shrl %cl, %esi +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ebx, %eax ; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: movl %ebx, %edi +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %edi -; X86-SLOW-NEXT: movl %ecx, %ebx -; X86-SLOW-NEXT: notb %bl -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: orl %esi, %edi +; X86-SLOW-NEXT: orl %eax, %edi ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload ; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %edx -; X86-SLOW-NEXT: movl %ebx, %ecx -; X86-SLOW-NEXT: shrl %cl, %edx -; X86-SLOW-NEXT: orl %eax, %edx -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-SLOW-NEXT: movl %ebx, %eax +; X86-SLOW-NEXT: shrl %ebx +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: orl %eax, %ebx ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload ; X86-SLOW-NEXT: shll %cl, %eax ; X86-SLOW-NEXT: shrl %esi -; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shrl %cl, %esi ; X86-SLOW-NEXT: orl %eax, %esi -; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; X86-SLOW-NEXT: shll %cl, %eax -; X86-SLOW-NEXT: shrl %ebx -; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload -; X86-SLOW-NEXT: shrl %cl, %ebx -; X86-SLOW-NEXT: orl %eax, %ebx ; X86-SLOW-NEXT: movl 8(%ebp), %eax -; X86-SLOW-NEXT: movl %ebx, 12(%eax) -; X86-SLOW-NEXT: movl %esi, 8(%eax) -; X86-SLOW-NEXT: movl %edx, 4(%eax) -; X86-SLOW-NEXT: movl %edi, (%eax) +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %ebx, 8(%eax) +; X86-SLOW-NEXT: movl %edi, 4(%eax) +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload +; X86-SLOW-NEXT: movl %ecx, (%eax) ; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll index 544ab7f..c307833 100644 --- a/llvm/test/CodeGen/X86/fshr.ll +++ b/llvm/test/CodeGen/X86/fshr.ll @@ -322,79 +322,79 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind { ; X86-SLOW-NEXT: subl $16, %esp ; X86-SLOW-NEXT: movl 24(%ebp), %edx ; X86-SLOW-NEXT: movl 28(%ebp), %esi -; X86-SLOW-NEXT: movl 48(%ebp), %ebx +; X86-SLOW-NEXT: movl 48(%ebp), %edi ; X86-SLOW-NEXT: movl 56(%ebp), %eax ; X86-SLOW-NEXT: testb $64, %al -; X86-SLOW-NEXT: movl 52(%ebp), %edi +; X86-SLOW-NEXT: movl 52(%ebp), %eax ; X86-SLOW-NEXT: je .LBB6_1 ; X86-SLOW-NEXT: # %bb.2: -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl %edx, %ebx +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl %edx, %edi ; X86-SLOW-NEXT: movl 32(%ebp), %edx -; X86-SLOW-NEXT: movl %edi, %eax -; X86-SLOW-NEXT: movl %esi, %edi +; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: movl %esi, %eax ; X86-SLOW-NEXT: movl 36(%ebp), %esi ; X86-SLOW-NEXT: jmp .LBB6_3 ; X86-SLOW-NEXT: .LBB6_1: -; X86-SLOW-NEXT: movl 40(%ebp), %eax -; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill -; X86-SLOW-NEXT: movl 44(%ebp), %eax +; X86-SLOW-NEXT: movl 40(%ebp), %ecx +; X86-SLOW-NEXT: movl %ecx, (%esp) # 4-byte Spill +; X86-SLOW-NEXT: movl 44(%ebp), %ecx ; X86-SLOW-NEXT: .LBB6_3: -; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: testb $32, %cl +; X86-SLOW-NEXT: movl 56(%ebp), %ebx +; X86-SLOW-NEXT: testb $32, %bl ; X86-SLOW-NEXT: je .LBB6_4 ; X86-SLOW-NEXT: # %bb.5: -; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl %ecx, %ebx ; X86-SLOW-NEXT: jmp .LBB6_6 ; X86-SLOW-NEXT: .LBB6_4: ; X86-SLOW-NEXT: movl %edx, %esi +; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill ; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill -; X86-SLOW-NEXT: movl %eax, %ebx -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload +; X86-SLOW-NEXT: movl %ecx, %edi +; X86-SLOW-NEXT: movl (%esp), %ebx # 4-byte Reload ; X86-SLOW-NEXT: .LBB6_6: -; X86-SLOW-NEXT: shrl %cl, %eax -; X86-SLOW-NEXT: movl %eax, %edx -; X86-SLOW-NEXT: movl %ecx, %eax -; X86-SLOW-NEXT: notb %al -; X86-SLOW-NEXT: movl %ebx, %edi -; X86-SLOW-NEXT: addl %ebx, %ebx -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %ebx -; X86-SLOW-NEXT: orl %edx, %ebx -; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl 56(%ebp), %ecx -; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %edi -; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload -; X86-SLOW-NEXT: leal (%ebx,%ebx), %edx -; X86-SLOW-NEXT: movl %eax, %ecx -; X86-SLOW-NEXT: shll %cl, %edx -; X86-SLOW-NEXT: orl %edi, %edx +; X86-SLOW-NEXT: shrl %cl, %ebx +; X86-SLOW-NEXT: movl %ecx, %edx +; X86-SLOW-NEXT: notb %dl +; X86-SLOW-NEXT: movl %edi, %eax +; X86-SLOW-NEXT: addl %edi, %edi +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: orl %ebx, %edi +; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-SLOW-NEXT: shrl %cl, %ebx -; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: shrl %cl, %eax ; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; X86-SLOW-NEXT: leal (%edi,%edi), %ebx -; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %ebx -; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload +; X86-SLOW-NEXT: orl %eax, %ebx ; X86-SLOW-NEXT: movl 56(%ebp), %ecx ; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx ; X86-SLOW-NEXT: shrl %cl, %edi +; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill +; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload +; X86-SLOW-NEXT: leal (%eax,%eax), %edi +; X86-SLOW-NEXT: movl %edx, %ecx +; X86-SLOW-NEXT: shll %cl, %edi +; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload +; X86-SLOW-NEXT: movl 56(%ebp), %ecx +; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-SLOW-NEXT: shrl %cl, %eax ; X86-SLOW-NEXT: addl %esi, %esi -; X86-SLOW-NEXT: movl %eax, %ecx +; X86-SLOW-NEXT: movl %edx, %ecx ; X86-SLOW-NEXT: shll %cl, %esi -; X86-SLOW-NEXT: orl %edi, %esi -; X86-SLOW-NEXT: movl 8(%ebp), %ecx -; X86-SLOW-NEXT: movl %esi, 12(%ecx) -; X86-SLOW-NEXT: movl %ebx, 8(%ecx) -; X86-SLOW-NEXT: movl %edx, 4(%ecx) -; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload -; X86-SLOW-NEXT: movl %eax, (%ecx) -; X86-SLOW-NEXT: movl %ecx, %eax +; X86-SLOW-NEXT: orl %eax, %esi +; X86-SLOW-NEXT: movl 8(%ebp), %eax +; X86-SLOW-NEXT: movl %esi, 12(%eax) +; X86-SLOW-NEXT: movl %edi, 8(%eax) +; X86-SLOW-NEXT: movl %ebx, 4(%eax) +; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload +; X86-SLOW-NEXT: movl %ecx, (%eax) ; X86-SLOW-NEXT: leal -12(%ebp), %esp ; X86-SLOW-NEXT: popl %esi ; X86-SLOW-NEXT: popl %edi diff --git a/llvm/test/CodeGen/X86/sbb.ll b/llvm/test/CodeGen/X86/sbb.ll index 78d609d..f5a3468 100644 --- a/llvm/test/CodeGen/X86/sbb.ll +++ b/llvm/test/CodeGen/X86/sbb.ll @@ -365,3 +365,32 @@ define i32 @uge_sext_add(i32 %0, i32 %1, i32 %2) { %6 = add nsw i32 %5, %0 ret i32 %6 } + +define i32 @sub_sub_ugt(i32 %a, i32 %b) { +; CHECK-LABEL: sub_sub_ugt: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cmpl %edi, %esi +; CHECK-NEXT: sbbl %esi, %eax +; CHECK-NEXT: retq + %cmp = icmp ugt i32 %a, %b + %conv = zext i1 %cmp to i32 + %sub = sub i32 %a, %b + %res = sub i32 %sub, %conv + ret i32 %res +} + +define i32 @sub_sub_ult(i32 %a, i32 %b) { +; CHECK-LABEL: sub_sub_ult: +; CHECK: # %bb.0: +; CHECK-NEXT: movl %edi, %eax +; CHECK-NEXT: cmpl %edi, %esi +; CHECK-NEXT: sbbl %esi, %eax +; CHECK-NEXT: retq + %cmp = icmp ult i32 %b, %a + %conv = zext i1 %cmp to i32 + %sub = sub i32 %a, %b + %res = sub i32 %sub, %conv + ret i32 %res +} + diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll index 7462c77..049ee47 100644 --- a/llvm/test/CodeGen/X86/shift-i128.ll +++ b/llvm/test/CodeGen/X86/shift-i128.ll @@ -613,8 +613,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou ; i686-NEXT: shldl %cl, %esi, %ebx ; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload ; i686-NEXT: movl %edi, %esi -; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload -; i686-NEXT: movl %eax, %ecx +; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload ; i686-NEXT: shll %cl, %esi ; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill ; i686-NEXT: negl %edx |