aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir206
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir136
-rw-r--r--llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir8
-rw-r--r--llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir4
-rw-r--r--llvm/test/CodeGen/AArch64/framelayout-split-sve.mir587
-rw-r--r--llvm/test/CodeGen/AArch64/framelayout-sve.mir12
-rw-r--r--llvm/test/CodeGen/AArch64/freeze.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll20
-rw-r--r--llvm/test/CodeGen/AArch64/llvm.modf.ll459
-rw-r--r--llvm/test/CodeGen/AArch64/pr161420.ll13
-rw-r--r--llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll5
-rw-r--r--llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll5
-rw-r--r--llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir16
-rw-r--r--llvm/test/CodeGen/AArch64/spillfill-sve.mir10
-rw-r--r--llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll824
-rw-r--r--llvm/test/CodeGen/AArch64/stack-hazard.ll876
-rw-r--r--llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll10
-rw-r--r--llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll2854
-rw-r--r--llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll331
-rw-r--r--llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll14
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/limit-coalesce.mir33
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll92
-rw-r--r--llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll89
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/memmove-var-size.ll16
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll75
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll7
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll94
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll27
-rw-r--r--llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll29
-rw-r--r--llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll23
-rw-r--r--llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll145
-rw-r--r--llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir7
-rw-r--r--llvm/test/CodeGen/RISCV/rvv/remat.ll132
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll53
-rw-r--r--llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll19
-rw-r--r--llvm/test/CodeGen/SystemZ/fp-cmp-04.ll4
-rw-r--r--llvm/test/CodeGen/VE/Vector/vec_divrem.ll56
-rw-r--r--llvm/test/CodeGen/X86/fshl.ll81
-rw-r--r--llvm/test/CodeGen/X86/fshr.ll90
-rw-r--r--llvm/test/CodeGen/X86/sbb.ll29
-rw-r--r--llvm/test/CodeGen/X86/shift-i128.ll3
51 files changed, 6479 insertions, 1113 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir
new file mode 100644
index 0000000..36ac7eb
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalize-modf.mir
@@ -0,0 +1,206 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -mtriple=aarch64 -run-pass=legalizer %s -o - | FileCheck %s
+---
+name: test_modf_f16
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: test_modf_f16
+ ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+ ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[COPY]](s16)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $s0 = COPY [[FPEXT]](s32)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0)
+ ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.0)
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[LOAD]](s32)
+ ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: $h0 = COPY [[FPTRUNC1]](s16)
+ ; CHECK-NEXT: $h1 = COPY [[FPTRUNC]](s16)
+ ; CHECK-NEXT: RET_ReallyLR implicit $h0, implicit $h1
+ %0:_(s16) = COPY $h0
+ %1:_(s16), %2:_(s16) = G_FMODF %0
+ $h0 = COPY %1(s16)
+ $h1 = COPY %2(s16)
+ RET_ReallyLR implicit $h0, implicit $h1
+...
+---
+name: test_modf_f16_only_use_fractional_part
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: test_modf_f16_only_use_fractional_part
+ ; CHECK: [[COPY:%[0-9]+]]:_(s16) = COPY $h0
+ ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[COPY]](s16)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $s0 = COPY [[FPEXT]](s32)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0)
+ ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: $h0 = COPY [[FPTRUNC]](s16)
+ ; CHECK-NEXT: RET_ReallyLR implicit $h0
+ %0:_(s16) = COPY $h0
+ %1:_(s16), %2:_(s16) = G_FMODF %0
+ $h0 = COPY %1(s16)
+ RET_ReallyLR implicit $h0
+...
+---
+name: test_modf_v2f16
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: test_modf_v2f16
+ ; CHECK: [[COPY:%[0-9]+]]:_(<4 x s16>) = COPY $d0
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s16), [[UV1:%[0-9]+]]:_(s16), [[UV2:%[0-9]+]]:_(s16), [[UV3:%[0-9]+]]:_(s16) = G_UNMERGE_VALUES [[COPY]](<4 x s16>)
+ ; CHECK-NEXT: [[FPEXT:%[0-9]+]]:_(s32) = G_FPEXT [[UV]](s16)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $s0 = COPY [[FPEXT]](s32)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0)
+ ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.1)
+ ; CHECK-NEXT: [[FPTRUNC:%[0-9]+]]:_(s16) = G_FPTRUNC [[LOAD]](s32)
+ ; CHECK-NEXT: [[FPTRUNC1:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY1]](s32)
+ ; CHECK-NEXT: [[FPEXT1:%[0-9]+]]:_(s32) = G_FPEXT [[UV1]](s16)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $s0 = COPY [[FPEXT1]](s32)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX1]](p0)
+ ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load (s32) from %stack.0)
+ ; CHECK-NEXT: [[FPTRUNC2:%[0-9]+]]:_(s16) = G_FPTRUNC [[LOAD1]](s32)
+ ; CHECK-NEXT: [[FPTRUNC3:%[0-9]+]]:_(s16) = G_FPTRUNC [[COPY2]](s32)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s16) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[FPTRUNC1]](s16), [[FPTRUNC3]](s16), [[DEF]](s16), [[DEF]](s16)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s16>) = G_BUILD_VECTOR [[FPTRUNC]](s16), [[FPTRUNC2]](s16), [[DEF]](s16), [[DEF]](s16)
+ ; CHECK-NEXT: $d0 = COPY [[BUILD_VECTOR]](<4 x s16>)
+ ; CHECK-NEXT: $d1 = COPY [[BUILD_VECTOR1]](<4 x s16>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1
+ %1:_(<4 x s16>) = COPY $d0
+ %0:_(<2 x s16>), %2:_(<2 x s16>) = G_UNMERGE_VALUES %1(<4 x s16>)
+ %3:_(<2 x s16>), %4:_(<2 x s16>) = G_FMODF %0
+ %5:_(s16), %6:_(s16) = G_UNMERGE_VALUES %3(<2 x s16>)
+ %7:_(s16) = G_IMPLICIT_DEF
+ %8:_(<4 x s16>) = G_BUILD_VECTOR %5(s16), %6(s16), %7(s16), %7(s16)
+ %9:_(s16), %10:_(s16) = G_UNMERGE_VALUES %4(<2 x s16>)
+ %11:_(<4 x s16>) = G_BUILD_VECTOR %9(s16), %10(s16), %7(s16), %7(s16)
+ $d0 = COPY %8(<4 x s16>)
+ $d1 = COPY %11(<4 x s16>)
+ RET_ReallyLR implicit $d0, implicit $d1
+...
+---
+name: test_modf_v3f32
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: test_modf_v3f32
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK-NEXT: [[BITCAST:%[0-9]+]]:_(<4 x s32>) = G_BITCAST [[COPY]](<2 x s64>)
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s32), [[UV1:%[0-9]+]]:_(s32), [[UV2:%[0-9]+]]:_(s32), [[UV3:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[BITCAST]](<4 x s32>)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.2
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $s0 = COPY [[UV]](s32)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0)
+ ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s32) from %stack.2)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $s0 = COPY [[UV1]](s32)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX1]](p0)
+ ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX1]](p0) :: (load (s32) from %stack.1)
+ ; CHECK-NEXT: [[FRAME_INDEX2:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $s0 = COPY [[UV2]](s32)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX2]](p0)
+ ; CHECK-NEXT: BL &modff, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $s0, implicit $x0, implicit-def $s0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY $s0
+ ; CHECK-NEXT: [[LOAD2:%[0-9]+]]:_(s32) = G_LOAD [[FRAME_INDEX2]](p0) :: (load (s32) from %stack.0)
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:_(s32) = G_IMPLICIT_DEF
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[COPY1]](s32), [[COPY2]](s32), [[COPY3]](s32), [[DEF]](s32)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[LOAD]](s32), [[LOAD1]](s32), [[LOAD2]](s32), [[DEF]](s32)
+ ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<4 x s32>)
+ ; CHECK-NEXT: $q1 = COPY [[BUILD_VECTOR1]](<4 x s32>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+ %1:_(<2 x s64>) = COPY $q0
+ %2:_(<4 x s32>) = G_BITCAST %1(<2 x s64>)
+ %3:_(s32), %4:_(s32), %5:_(s32), %6:_(s32) = G_UNMERGE_VALUES %2(<4 x s32>)
+ %0:_(<3 x s32>) = G_BUILD_VECTOR %3(s32), %4(s32), %5(s32)
+ %7:_(<3 x s32>), %8:_(<3 x s32>) = G_FMODF %0
+ %9:_(s32), %10:_(s32), %11:_(s32) = G_UNMERGE_VALUES %7(<3 x s32>)
+ %12:_(s32) = G_IMPLICIT_DEF
+ %13:_(<4 x s32>) = G_BUILD_VECTOR %9(s32), %10(s32), %11(s32), %12(s32)
+ %14:_(s32), %15:_(s32), %16:_(s32) = G_UNMERGE_VALUES %8(<3 x s32>)
+ %17:_(<4 x s32>) = G_BUILD_VECTOR %14(s32), %15(s32), %16(s32), %12(s32)
+ $q0 = COPY %13(<4 x s32>)
+ $q1 = COPY %17(<4 x s32>)
+ RET_ReallyLR implicit $q0, implicit $q1
+...
+---
+name: test_modf_v2f64
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: test_modf_v2f64
+ ; CHECK: [[COPY:%[0-9]+]]:_(<2 x s64>) = COPY $q0
+ ; CHECK-NEXT: [[UV:%[0-9]+]]:_(s64), [[UV1:%[0-9]+]]:_(s64) = G_UNMERGE_VALUES [[COPY]](<2 x s64>)
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.1
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $d0 = COPY [[UV]](s64)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0)
+ ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s64) from %stack.1)
+ ; CHECK-NEXT: [[FRAME_INDEX1:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $d0 = COPY [[UV1]](s64)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX1]](p0)
+ ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $d0
+ ; CHECK-NEXT: [[LOAD1:%[0-9]+]]:_(s64) = G_LOAD [[FRAME_INDEX1]](p0) :: (load (s64) from %stack.0)
+ ; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY1]](s64), [[COPY2]](s64)
+ ; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[LOAD]](s64), [[LOAD1]](s64)
+ ; CHECK-NEXT: $q0 = COPY [[BUILD_VECTOR]](<2 x s64>)
+ ; CHECK-NEXT: $q1 = COPY [[BUILD_VECTOR1]](<2 x s64>)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+ %0:_(<2 x s64>) = COPY $q0
+ %1:_(<2 x s64>), %2:_(<2 x s64>) = G_FMODF %0
+ $q0 = COPY %1(<2 x s64>)
+ $q1 = COPY %2(<2 x s64>)
+ RET_ReallyLR implicit $q0, implicit $q1
+...
+---
+name: test_modf_fp128
+body: |
+ bb.0.entry:
+ ; CHECK-LABEL: name: test_modf_fp128
+ ; CHECK: [[COPY:%[0-9]+]]:_(s128) = COPY $q0
+ ; CHECK-NEXT: [[FRAME_INDEX:%[0-9]+]]:_(p0) = G_FRAME_INDEX %stack.0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $q0 = COPY [[COPY]](s128)
+ ; CHECK-NEXT: $x0 = COPY [[FRAME_INDEX]](p0)
+ ; CHECK-NEXT: BL &modfl, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $q0, implicit $x0, implicit-def $q0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:_(s128) = COPY $q0
+ ; CHECK-NEXT: [[LOAD:%[0-9]+]]:_(s128) = G_LOAD [[FRAME_INDEX]](p0) :: (load (s128) from %stack.0)
+ ; CHECK-NEXT: $q0 = COPY [[COPY1]](s128)
+ ; CHECK-NEXT: $q1 = COPY [[LOAD]](s128)
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+ %0:_(s128) = COPY $q0
+ %1:_(s128), %2:_(s128) = G_FMODF %0
+ $q0 = COPY %1(s128)
+ $q1 = COPY %2(s128)
+ RET_ReallyLR implicit $q0, implicit $q1
+...
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
index ba867f4..d721b73c 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir
@@ -508,6 +508,10 @@
# DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: G_FMODF (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FPOW (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir b/llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir
new file mode 100644
index 0000000..604cb96
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/select-modf.mir
@@ -0,0 +1,136 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+# RUN: llc -verify-machineinstrs -mtriple aarch64-unknown-unknown -run-pass=instruction-select %s -o - | FileCheck %s
+---
+name: test_modf_fp128
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+ - { reg: '$q0' }
+frameInfo:
+ maxAlignment: 16
+stack:
+ - { id: 0, size: 16, alignment: 16 }
+body: |
+ bb.1:
+ liveins: $q0
+
+ ; CHECK-LABEL: name: test_modf_fp128
+ ; CHECK: liveins: $q0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr128 = COPY $q0
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $q0 = COPY [[COPY]]
+ ; CHECK-NEXT: $x0 = COPY [[ADDXri]]
+ ; CHECK-NEXT: BL &modfl, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $q0, implicit $x0, implicit-def $q0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr128 = COPY $q0
+ ; CHECK-NEXT: [[LDRQui:%[0-9]+]]:fpr128 = LDRQui %stack.0, 0 :: (load (s128) from %stack.0)
+ ; CHECK-NEXT: $q0 = COPY [[COPY1]]
+ ; CHECK-NEXT: $q1 = COPY [[LDRQui]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $q0, implicit $q1
+ %0:fpr(s128) = COPY $q0
+ %3:gpr(p0) = G_FRAME_INDEX %stack.0
+ ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ $q0 = COPY %0(s128)
+ $x0 = COPY %3(p0)
+ BL &modfl, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $q0, implicit $x0, implicit-def $q0
+ ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ %1:fpr(s128) = COPY $q0
+ %2:fpr(s128) = G_LOAD %3(p0) :: (load (s128) from %stack.0)
+ $q0 = COPY %1(s128)
+ $q1 = COPY %2(s128)
+ RET_ReallyLR implicit $q0, implicit $q1
+...
+---
+name: test_modf_double
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+ - { reg: '$d0' }
+frameInfo:
+ maxAlignment: 8
+stack:
+ - { id: 0, size: 8, alignment: 8 }
+machineFunctionInfo: {}
+body: |
+ bb.1:
+ liveins: $d0
+
+ ; CHECK-LABEL: name: test_modf_double
+ ; CHECK: liveins: $d0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $d0 = COPY [[COPY]]
+ ; CHECK-NEXT: $x0 = COPY [[ADDXri]]
+ ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d0
+ ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui %stack.0, 0 :: (load (s64) from %stack.0)
+ ; CHECK-NEXT: $d0 = COPY [[COPY1]]
+ ; CHECK-NEXT: $d1 = COPY [[LDRDui]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1
+ %0:fpr(s64) = COPY $d0
+ %3:gpr(p0) = G_FRAME_INDEX %stack.0
+ ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ $d0 = COPY %0(s64)
+ $x0 = COPY %3(p0)
+ BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0
+ ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ %1:fpr(s64) = COPY $d0
+ %2:fpr(s64) = G_LOAD %3(p0) :: (load (s64) from %stack.0)
+ $d0 = COPY %1(s64)
+ $d1 = COPY %2(s64)
+ RET_ReallyLR implicit $d0, implicit $d1
+...
+---
+name: test_modf_double_vec
+alignment: 4
+legalized: true
+regBankSelected: true
+tracksRegLiveness: true
+liveins:
+ - { reg: '$d0' }
+frameInfo:
+ maxAlignment: 8
+stack:
+ - { id: 0, size: 8, alignment: 8 }
+machineFunctionInfo: {}
+body: |
+ bb.1:
+ liveins: $d0
+
+ ; CHECK-LABEL: name: test_modf_double_vec
+ ; CHECK: liveins: $d0
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:fpr64 = COPY $d0
+ ; CHECK-NEXT: [[ADDXri:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0, 0
+ ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: $d0 = COPY [[COPY]]
+ ; CHECK-NEXT: $x0 = COPY [[ADDXri]]
+ ; CHECK-NEXT: BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0
+ ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:fpr64 = COPY $d0
+ ; CHECK-NEXT: [[LDRDui:%[0-9]+]]:fpr64 = LDRDui %stack.0, 0 :: (load (s64) from %stack.0)
+ ; CHECK-NEXT: $d0 = COPY [[COPY1]]
+ ; CHECK-NEXT: $d1 = COPY [[LDRDui]]
+ ; CHECK-NEXT: RET_ReallyLR implicit $d0, implicit $d1
+ %0:fpr(s64) = COPY $d0
+ %3:gpr(p0) = G_FRAME_INDEX %stack.0
+ ADJCALLSTACKDOWN 0, 0, implicit-def $sp, implicit $sp
+ $d0 = COPY %0(s64)
+ $x0 = COPY %3(p0)
+ BL &modf, csr_aarch64_aapcs, implicit-def $lr, implicit $sp, implicit $d0, implicit $x0, implicit-def $d0
+ ADJCALLSTACKUP 0, 0, implicit-def $sp, implicit $sp
+ %1:fpr(s64) = COPY $d0
+ %2:fpr(s64) = G_LOAD %3(p0) :: (load (s64) from %stack.0)
+ $d0 = COPY %1(s64)
+ $d1 = COPY %2(s64)
+ RET_ReallyLR implicit $d0, implicit $d1
+...
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
index aca2816..7fd0cee 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-declare.mir
@@ -164,10 +164,10 @@ stack:
- { id: 1, name: z1.addr, size: 16, alignment: 16, stack-id: scalable-vector,
debug-info-variable: '!31', debug-info-expression: '!DIExpression()',
debug-info-location: '!32' }
- - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 2, name: p0.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!33', debug-info-expression: '!DIExpression()',
debug-info-location: '!34' }
- - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 3, name: p1.addr, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!35', debug-info-expression: '!DIExpression()',
debug-info-location: '!36' }
- { id: 4, name: w0.addr, size: 4, alignment: 4, local-offset: -4, debug-info-variable: '!37',
@@ -181,10 +181,10 @@ stack:
- { id: 7, name: localv1, size: 16, alignment: 16, stack-id: scalable-vector,
debug-info-variable: '!45', debug-info-expression: '!DIExpression()',
debug-info-location: '!46' }
- - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 8, name: localp0, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!48', debug-info-expression: '!DIExpression()',
debug-info-location: '!49' }
- - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-vector,
+ - { id: 9, name: localp1, size: 2, alignment: 2, stack-id: scalable-predicate-vector,
debug-info-variable: '!51', debug-info-expression: '!DIExpression()',
debug-info-location: '!52' }
machineFunctionInfo: {}
diff --git a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
index 0ea180b..41ba554 100644
--- a/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
+++ b/llvm/test/CodeGen/AArch64/debug-info-sve-dbg-value.mir
@@ -96,8 +96,8 @@ stack:
- { id: 1, size: 8, alignment: 8 }
- { id: 2, size: 16, alignment: 16, stack-id: scalable-vector }
- { id: 3, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 4, size: 2, alignment: 2, stack-id: scalable-vector }
- - { id: 5, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 4, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
+ - { id: 5, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
machineFunctionInfo: {}
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
new file mode 100644
index 0000000..35eafe8
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/framelayout-split-sve.mir
@@ -0,0 +1,587 @@
+# RUN: llc -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -mtriple=aarch64-none-linux-gnu -run-pass=prologepilog %s -o - | FileCheck %s
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -o - | FileCheck %s --check-prefix=ASM
+# RUN: llc -mtriple=aarch64-none-linux-gnu -mattr=+sve -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -start-before=prologepilog %s -filetype=obj -o %t
+# RUN: llvm-objdump --dwarf=frames %t | FileCheck %s --check-prefix=UNWINDINFO
+# RUN: rm -rf %t
+#
+# Test allocation and deallocation of SVE objects on the stack with
+# split-sve-objects (and hazard padding) enabled. This also tests using a
+# combination of scalable and non-scalable offsets to access the SVE objects
+# on the stack.
+#
+# With split-sve-objects (which implies hazard padding) the SVE area is split
+# into PPR and ZPR areas with (fixed-size) hazard padding between them. The PPR
+# area holds all scalable predicate callee saves and locals, and the ZPR area
+# holds all scalable vector callee saves and locals. Additionally, any FPR
+# callee save is promoted to a ZPR callee save (to avoid needing additional
+# hazard padding in the callee save area).
+#
+# +-------------+
+# | stack arg |
+# +-------------+ <- SP before call
+# | Callee Saves|
+# | Frame record| (if available)
+# |-------------| <- FP (if available)
+# | PPR area |
+# |-------------|
+# |/////////////| hazard padding
+# |-------------|
+# | ZPR area |
+# +-------------+
+# | : |
+# | Stack objs |
+# | : |
+# +-------------+ <- SP after call and frame-setup
+#
+--- |
+
+ define void @test_allocate_split_sve() uwtable { entry: unreachable }
+ define void @test_allocate_split_sve_realigned() uwtable { entry: unreachable }
+ define void @test_address_split_sve() uwtable { entry: unreachable }
+ define void @test_address_split_sve_fp() uwtable { entry: unreachable }
+ define aarch64_sve_vector_pcs void @save_restore_ppr_zpr() uwtable { entry: unreachable }
+
+...
+---
+# +----------+
+# |scratchreg| // x29 is used as scratch reg.
+# |----------|
+# | %stack.1 | // scalable predicate of n * 12 bytes, aligned to 16 bytes
+# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes)
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# +----------+
+# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.2 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_allocate_split_sve
+# CHECK: stackSize: 1056
+
+# CHECK: bb.0.entry:
+# CHECK: liveins: $z0, $p0, $fp
+# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.4)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+#
+# CHECK-NEXT: $x8 = ADDXri $sp, 1040, 0
+# CHECK-NEXT: $x8 = ADDPL_XXI $x8, 7, implicit $vg
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: $x8 = ADDXri $sp, 2064, 0
+# CHECK-NEXT: STR_PXI $p0, killed $x8, 18 :: (store (<vscale x 1 x s16>) into %stack.1)
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.4)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_allocate_split_sve:
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 1056
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_allocate_split_sve
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 }
+ - { id: 1, stack-id: scalable-predicate-vector, size: 12, alignment: 2 }
+ - { id: 2, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $p0
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1)
+ RET_ReallyLR
+...
+---
+
+# Stack realignment is not supported with split-sve-objects, so we fall back to
+# the default hazard padding implementation. This does not prevent hazards
+# between ZPRs and PPRs (TODO: support this case).
+#
+# +----------+
+# | lr, fp | // frame record
+# |----------|
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.1 | // scalable predicate of n * 12 bytes, aligned to 16 bytes
+# | | // to be materialized with 1*ADDVL (<=> n * 16 bytes)
+# +----------+
+# | %stack.0 | // scalable SVE object of n * 18 bytes, aligned to 16 bytes,
+# | | // to be materialized with 2*ADDVL (<=> 2 * n * 16 bytes)
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.2 | // not scalable
+# +----------+ <- SP
+
+name: test_allocate_split_sve_realigned
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 18, alignment: 2 }
+ - { id: 1, stack-id: scalable-vector, size: 12, alignment: 2 }
+ - { id: 2, stack-id: default, size: 16, alignment: 32 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $p0
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_PXI $p0, %stack.1, 0 :: (store (<vscale x 1 x s16>) into %stack.1)
+ RET_ReallyLR
+
+# CHECK-LABEL: name: test_allocate_split_sve_realigned
+# CHECK: stackSize: 2080
+
+# CHECK: bb.0.entry:
+# CHECK: liveins: $z0, $p0, $lr
+# CHECK: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup STRXui killed $lr, $sp, 129 :: (store (s64) into %stack.4)
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: $[[TMP]] = frame-setup ADDVL_XXI $[[TMP]], -2, implicit $vg
+# CHECK-NEXT: $sp = frame-setup ANDXri killed $x9, 7930
+#
+# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
+# CHECK-NEXT: $x8 = ADDPL_XXI $x8, -1, implicit $vg
+# CHECK-NEXT: STR_ZXI $z0, killed $x8, -1 :: (store (<vscale x 1 x s128>) into %stack.0)
+# CHECK-NEXT: $x8 = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_PXI $p0, killed $x8, -15 :: (store (<vscale x 1 x s16>) into %stack.1)
+#
+# CHECK-NEXT: $sp = frame-destroy SUBXri $fp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1040
+# CHECK-NEXT: $lr = frame-destroy LDRXui $sp, 129 :: (load (s64) from %stack.4)
+# CHECK-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.5)
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_allocate_split_sve_realigned
+# ASM: sub sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: str x29, [sp, #1024]
+# ASM-NEXT: str x30, [sp, #1032]
+# ASM-NEXT: add x29, sp, #1024
+# ASM-NEXT: .cfi_def_cfa w29, 16
+# ASM-NEXT: .cfi_offset w30, -8
+# ASM-NEXT: .cfi_offset w29, -16
+#
+# ASM: sub sp, x29, #1024
+# ASM-NEXT: .cfi_def_cfa wsp, 1040
+# ASM-NEXT: ldr x30, [sp, #1032]
+# ASM-NEXT: ldr x29, [sp, #1024]
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w30
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+#
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1040
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg30
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+...
+---
+
+# +----------+
+# |scratchreg| // x29 is used as scratch reg.
+# +----------+
+# | %stack.2 | // scalable predicate @ SP + 2064b + 46 scalable bytes
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# |----------|
+# | %stack.0 | // scalable vector @ SP + 1040b + 16 scalable bytes
+# | %stack.1 | // scalable vector @ SP + 1040b
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.3 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_address_split_sve
+# CHECK: stackSize: 1056
+
+# CHECK: bb.0.entry:
+# CHECK-NEXT: liveins:
+# CHECK-NEXT: {{ $}}
+# CHECK-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 1040
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22
+#
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0
+# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], 1
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 1040, 0
+# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], 0
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = ADDXri $sp, 2064, 0
+# CHECK-NEXT: STR_PXI $p0, killed $[[TMP]], 23
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 1056
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.5)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_address_split_sve
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_def_cfa_offset 1040
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x48, 0x1e, 0x22 // sp + 2080 + 24 * VG
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1056 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 1056
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +1040
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit24, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2080, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1056, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +1056
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_address_split_sve
+frameInfo:
+ maxAlignment: 16
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 }
+ - { id: 3, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $z1, $p0
+
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1)
+ STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2)
+
+ RET_ReallyLR
+...
+---
+# +----------+
+# | lr, fp | // frame record
+# +----------+ <- FP
+# | %stack.2 | // scalable predicate @ FP - 2 scalable bytes
+# |----------|
+# |//////////| // hazard padding (1024 bytes) -- part of PPR locals area
+# |//////////| // Note: This is currently not included in the "stackSize"
+# |----------|
+# | %stack.0 | // scalable vector @ FP - 1024b - 32 scalable bytes
+# | %stack.1 | // scalable vector @ FP - 1024b - 48 scalable bytes
+# +----------+
+# |//////////| // hazard padding (1024 bytes)
+# |----------|
+# | %stack.3 | // not scalable
+# +----------+ <- SP
+
+# CHECK-LABEL: name: test_address_split_sve_fp
+# CHECK: stackSize: 1056
+#
+# CHECK: bb.0.entry:
+# CHECK-NEXT: liveins:
+# CHECK-NEXT: {{ $}}
+# CHECK-NEXT: early-clobber $sp = frame-setup STPXpre killed $fp, killed $lr, $sp, -2 :: (store (s64) into %stack.6), (store (s64) into %stack.5)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: $fp = frame-setup ADDXri $sp, 0, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa $w29, 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w30, -8
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
+#
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_ZXI $z0, killed $[[TMP]], -2
+# CHECK-NEXT: $[[TMP:x[0-9]+]] = SUBXri $fp, 1024, 0
+# CHECK-NEXT: STR_ZXI $z1, killed $[[TMP]], -3
+# CHECK-NEXT: STR_PXI $p0, $fp, -1
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp, $lr = frame-destroy LDPXpost $sp, 2 :: (load (s64) from %stack.6), (load (s64) from %stack.5)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w30
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: test_address_split_sve_fp
+# ASM: stp x29, x30, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: mov x29, sp
+# ASM-NEXT: .cfi_def_cfa w29, 16
+# ASM-NEXT: .cfi_offset w30, -8
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: sub sp, sp, #1040
+# ASM-NEXT: addvl sp, sp, #-2
+#
+# ASM: addvl sp, sp, #2
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: add sp, sp, #1040
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldp x29, x30, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w30
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO: DW_CFA_def_cfa: reg29 +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg30 -8
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+#
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg30
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: test_address_split_sve_fp
+frameInfo:
+ maxAlignment: 16
+ isFrameAddressTaken: true
+stack:
+ - { id: 0, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 1, stack-id: scalable-vector, size: 16, alignment: 8 }
+ - { id: 2, stack-id: scalable-vector, size: 2, alignment: 2 }
+ - { id: 3, stack-id: default, size: 16, alignment: 8 }
+body: |
+ bb.0.entry:
+ liveins: $z0, $z1, $p0
+
+ STR_ZXI $z0, %stack.0, 0 :: (store (<vscale x 1 x s128>) into %stack.0)
+ STR_ZXI $z1, %stack.1, 0 :: (store (<vscale x 1 x s128>) into %stack.1)
+ STR_PXI $p0, %stack.2, 0 :: (store (<vscale x 1 x s16>) into %stack.2)
+
+ RET_ReallyLR
+...
+---
+# CHECK-LABEL: name: save_restore_ppr_zpr
+# CHECK: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.8)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION def_cfa_offset 16
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION offset $w29, -16
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -1, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: frame-setup STR_PXI killed $p6, $sp, 5 :: (store (s16) into %stack.7)
+# CHECK-NEXT: frame-setup STR_PXI killed $p5, $sp, 6 :: (store (s16) into %stack.6)
+# CHECK-NEXT: frame-setup STR_PXI killed $p4, $sp, 7 :: (store (s16) into %stack.5)
+#
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+#
+# CHECK-NEXT: $sp = frame-setup ADDVL_XXI $sp, -3, implicit $vg
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+# CHECK-NEXT: frame-setup STR_ZXI killed $z10, $sp, 0 :: (store (s128) into %stack.4)
+# CHECK-NEXT: frame-setup STR_ZXI killed $z9, $sp, 1 :: (store (s128) into %stack.3)
+# CHECK-NEXT: frame-setup STR_ZXI killed $z8, $sp, 2 :: (store (s128) into %stack.2)
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22
+# CHECK-NEXT: $sp = frame-setup SUBXri $sp, 1056, 0
+# CHECK-NEXT: frame-setup CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+#
+#
+# CHECK: $sp = frame-destroy ADDXri $sp, 1056, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+# CHECK-NEXT: $z10 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.4)
+# CHECK-NEXT: $z9 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.3)
+# CHECK-NEXT: $z8 = frame-destroy LDR_ZXI $sp, 2 :: (load (s128) from %stack.2)
+#
+# CHECK-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22
+#
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 3, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z8
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z9
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $z10
+# CHECK-NEXT: $p6 = frame-destroy LDR_PXI $sp, 5 :: (load (s16) from %stack.7)
+# CHECK-NEXT: $p5 = frame-destroy LDR_PXI $sp, 6 :: (load (s16) from %stack.6)
+# CHECK-NEXT: $p4 = frame-destroy LDR_PXI $sp, 7 :: (load (s16) from %stack.5)
+# CHECK-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 1, implicit $vg
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa $wsp, 16
+# CHECK-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.8)
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION def_cfa_offset 0
+# CHECK-NEXT: frame-destroy CFI_INSTRUCTION restore $w29
+# CHECK-NEXT: RET_ReallyLR
+
+# ASM-LABEL: save_restore_ppr_zpr:
+# ASM: str x29, [sp, #-16]!
+# ASM-NEXT: .cfi_def_cfa_offset 16
+# ASM-NEXT: .cfi_offset w29, -16
+# ASM-NEXT: addvl sp, sp, #-1
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+# ASM-NEXT: str p6, [sp, #5, mul vl]
+# ASM-NEXT: str p5, [sp, #6, mul vl]
+# ASM-NEXT: str p4, [sp, #7, mul vl]
+# ASM-NEXT: sub sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 1040 + 8 * VG
+# ASM-NEXT: addvl sp, sp, #-3
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG
+# ASM-NEXT: str z10, [sp]
+# ASM-NEXT: str z9, [sp, #1, mul vl]
+# ASM-NEXT: str z8, [sp, #2, mul vl]
+# ASM-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+# ASM-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 24 * VG - 1040
+# ASM-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 32 * VG - 1040
+# ASM-NEXT: sub sp, sp, #1056
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 2096 + 32 * VG
+#
+# ASM: add sp, sp, #1056
+# ASM-NEXT: .cfi_escape 0x0f, 0x0a, 0x8f, 0x90, 0x08, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 1040 + 32 * VG
+# ASM-NEXT: ldr z10, [sp]
+# ASM-NEXT: ldr z9, [sp, #1, mul vl]
+# ASM-NEXT: ldr z8, [sp, #2, mul vl]
+# ASM-NEXT: add sp, sp, #1024
+# ASM-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x11, 0x20, 0x1e, 0x22 // sp + 16 + 32 * VG
+# ASM-NEXT: addvl sp, sp, #3
+# ASM-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 16 + 8 * VG
+# ASM-NEXT: .cfi_restore z8
+# ASM-NEXT: .cfi_restore z9
+# ASM-NEXT: .cfi_restore z10
+# ASM-NEXT: ldr p6, [sp, #5, mul vl]
+# ASM-NEXT: ldr p5, [sp, #6, mul vl]
+# ASM-NEXT: ldr p4, [sp, #7, mul vl]
+# ASM-NEXT: addvl sp, sp, #1
+# ASM-NEXT: .cfi_def_cfa wsp, 16
+# ASM-NEXT: ldr x29, [sp], #16
+# ASM-NEXT: .cfi_def_cfa_offset 0
+# ASM-NEXT: .cfi_restore w29
+
+# UNWINDINFO: DW_CFA_def_cfa_offset: +16
+# UNWINDINFO-NEXT: DW_CFA_offset: reg29 -16
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg72 DW_OP_bregx 0x2e +0, DW_OP_consts -16, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg73 DW_OP_bregx 0x2e +0, DW_OP_consts -24, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_expression: reg74 DW_OP_bregx 0x2e +0, DW_OP_consts -32, DW_OP_mul, DW_OP_plus, DW_OP_consts -1040, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +2096, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+#
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +1040, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_consts +32, DW_OP_mul, DW_OP_plus
+# UNWINDINFO: DW_CFA_def_cfa_expression: DW_OP_breg31 +16, DW_OP_bregx 0x2e +0, DW_OP_lit8, DW_OP_mul, DW_OP_plus
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg104
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg105
+# UNWINDINFO-NEXT: DW_CFA_restore_extended: reg106
+# UNWINDINFO: DW_CFA_def_cfa: reg31 +16
+# UNWINDINFO: DW_CFA_def_cfa_offset: +0
+# UNWINDINFO-NEXT: DW_CFA_restore: reg29
+
+name: save_restore_ppr_zpr
+stack:
+ - { id: 0, stack-id: default, size: 32, alignment: 16 }
+body: |
+ bb.0.entry:
+
+ $p4 = IMPLICIT_DEF
+ $p5 = IMPLICIT_DEF
+ $p6 = IMPLICIT_DEF
+ $z8 = IMPLICIT_DEF
+ $z9 = IMPLICIT_DEF
+ $z10 = IMPLICIT_DEF
+
+ RET_ReallyLR
diff --git a/llvm/test/CodeGen/AArch64/framelayout-sve.mir b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
index 03a6aab..1101416 100644
--- a/llvm/test/CodeGen/AArch64/framelayout-sve.mir
+++ b/llvm/test/CodeGen/AArch64/framelayout-sve.mir
@@ -1215,19 +1215,19 @@ body: |
# CHECK: - { id: 2, name: '', type: default, offset: -112, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector,
# CHECK: - { id: 3, name: '', type: default, offset: -114, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector,
+# CHECK-NEXT: stack-id: scalable-predicate-vector,
# CHECK: - { id: 4, name: '', type: spill-slot, offset: -144, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector,
# CHECK: - { id: 5, name: '', type: spill-slot, offset: -146, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector,
+# CHECK-NEXT: stack-id: scalable-predicate-vector,
# CHECK: - { id: 6, name: '', type: spill-slot, offset: -16, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z8',
# CHECK: - { id: 7, name: '', type: spill-slot, offset: -32, size: 16, alignment: 16,
# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$z23',
# CHECK: - { id: 8, name: '', type: spill-slot, offset: -34, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p4',
+# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p4',
# CHECK: - { id: 9, name: '', type: spill-slot, offset: -36, size: 2, alignment: 2,
-# CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: '$p15',
+# CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: '$p15',
# CHECK: - { id: 10, name: '', type: spill-slot, offset: -16, size: 8, alignment: 16,
# CHECK-NEXT: stack-id: default, callee-saved-register: '$fp',
#
@@ -1295,9 +1295,9 @@ stack:
- { id: 0, type: default, size: 32, alignment: 16, stack-id: scalable-vector }
- { id: 1, type: default, size: 4, alignment: 2, stack-id: scalable-vector }
- { id: 2, type: default, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 3, type: default, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
- { id: 4, type: spill-slot, size: 16, alignment: 16, stack-id: scalable-vector }
- - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-vector }
+ - { id: 5, type: spill-slot, size: 2, alignment: 2, stack-id: scalable-predicate-vector }
body: |
bb.0.entry:
diff --git a/llvm/test/CodeGen/AArch64/freeze.ll b/llvm/test/CodeGen/AArch64/freeze.ll
index fae3bbe..fb909fe 100644
--- a/llvm/test/CodeGen/AArch64/freeze.ll
+++ b/llvm/test/CodeGen/AArch64/freeze.ll
@@ -466,15 +466,12 @@ define <8 x i16> @freeze_urhadd(<8 x i16> %a0, <8 x i16> %a1) {
ret <8 x i16> %masked
}
-; TODO: Unnecessary sext_inreg
define <8 x i16> @freeze_shadd(<8 x i8> %a0, <8 x i16> %a1) {
; CHECK-LABEL: freeze_shadd:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-NEXT: sshr v1.8h, v1.8h, #8
; CHECK-NEXT: shadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: shl v0.8h, v0.8h, #8
-; CHECK-NEXT: sshr v0.8h, v0.8h, #8
; CHECK-NEXT: ret
%x0 = sext <8 x i8> %a0 to <8 x i16>
%x1 = ashr <8 x i16> %a1, splat (i16 8)
@@ -485,15 +482,12 @@ define <8 x i16> @freeze_shadd(<8 x i8> %a0, <8 x i16> %a1) {
ret <8 x i16> %sext
}
-; TODO: Unnecessary sext_inreg
define <8 x i16> @freeze_srhadd(<8 x i8> %a0, <8 x i16> %a1) {
; CHECK-LABEL: freeze_srhadd:
; CHECK: // %bb.0:
; CHECK-NEXT: sshll v0.8h, v0.8b, #0
; CHECK-NEXT: sshr v1.8h, v1.8h, #8
; CHECK-NEXT: srhadd v0.8h, v0.8h, v1.8h
-; CHECK-NEXT: shl v0.8h, v0.8h, #8
-; CHECK-NEXT: sshr v0.8h, v0.8h, #8
; CHECK-NEXT: ret
%x0 = sext <8 x i8> %a0 to <8 x i16>
%x1 = ashr <8 x i16> %a1, splat (i16 8)
diff --git a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
index b89f551..e2c861b 100644
--- a/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
+++ b/llvm/test/CodeGen/AArch64/get-active-lane-mask-extract.ll
@@ -327,9 +327,6 @@ define void @test_2x8bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) {
; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_ptest:
; CHECK-SVE2p1-SME2: // %bb.0: // %entry
; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.h, p1.h }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: ptrue p2.b
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.b, p0.b, p1.b
-; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b
; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB11_2
; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
; CHECK-SVE2p1-SME2-NEXT: b use
@@ -368,9 +365,6 @@ define void @test_2x8bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n
; CHECK-SVE2p1-SME2-LABEL: test_2x8bit_mask_with_extracts_and_reinterpret_casts:
; CHECK-SVE2p1-SME2: // %bb.0: // %entry
; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: ptrue p2.h
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p3.h, p0.h, p1.h
-; CHECK-SVE2p1-SME2-NEXT: ptest p2, p3.b
; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB12_2
; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
; CHECK-SVE2p1-SME2-NEXT: b use
@@ -413,14 +407,9 @@ define void @test_4x4bit_mask_with_extracts_and_ptest(i64 %i, i64 %n) {
; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8
; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo
; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.s, p1.s }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.s, p3.s }, x8, x1
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p0.h, p1.h
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.h, p2.h, p3.h
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.b, p4.b, p5.b
-; CHECK-SVE2p1-SME2-NEXT: ptrue p5.b
-; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b
; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB13_2
; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.s, p3.s }, x8, x1
; CHECK-SVE2p1-SME2-NEXT: b use
; CHECK-SVE2p1-SME2-NEXT: .LBB13_2: // %if.end
; CHECK-SVE2p1-SME2-NEXT: ret
@@ -463,14 +452,9 @@ define void @test_4x2bit_mask_with_extracts_and_reinterpret_casts(i64 %i, i64 %n
; CHECK-SVE2p1-SME2-NEXT: adds x8, x0, x8
; CHECK-SVE2p1-SME2-NEXT: csinv x8, x8, xzr, lo
; CHECK-SVE2p1-SME2-NEXT: whilelo { p0.d, p1.d }, x0, x1
-; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.d, p3.d }, x8, x1
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.s, p0.s, p1.s
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p5.s, p2.s, p3.s
-; CHECK-SVE2p1-SME2-NEXT: uzp1 p4.h, p4.h, p5.h
-; CHECK-SVE2p1-SME2-NEXT: ptrue p5.h
-; CHECK-SVE2p1-SME2-NEXT: ptest p5, p4.b
; CHECK-SVE2p1-SME2-NEXT: b.pl .LBB14_2
; CHECK-SVE2p1-SME2-NEXT: // %bb.1: // %if.then
+; CHECK-SVE2p1-SME2-NEXT: whilelo { p2.d, p3.d }, x8, x1
; CHECK-SVE2p1-SME2-NEXT: b use
; CHECK-SVE2p1-SME2-NEXT: .LBB14_2: // %if.end
; CHECK-SVE2p1-SME2-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/llvm.modf.ll b/llvm/test/CodeGen/AArch64/llvm.modf.ll
index 41fe796..503742f 100644
--- a/llvm/test/CodeGen/AArch64/llvm.modf.ll
+++ b/llvm/test/CodeGen/AArch64/llvm.modf.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK %s
+; RUN: llc -mtriple=aarch64-gnu-linux < %s | FileCheck -check-prefixes=CHECK,CHECK-SD %s
+; RUN: llc -mtriple=aarch64-gnu-linux -global-isel < %s | FileCheck -check-prefixes=CHECK,CHECK-GI %s
define { half, half } @test_modf_f16(half %a) {
; CHECK-LABEL: test_modf_f16:
@@ -55,61 +56,95 @@ define half @test_modf_f16_only_use_integral_part(half %a) {
}
define { <2 x half>, <2 x half> } @test_modf_v2f16(<2 x half> %a) {
-; CHECK-LABEL: test_modf_v2f16:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: mov h1, v0.h[1]
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: add x0, sp, #44
-; CHECK-NEXT: fcvt s0, h1
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: fcvt h0, s0
-; CHECK-NEXT: add x0, sp, #40
-; CHECK-NEXT: fcvt s1, h1
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: fmov s0, s1
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: fcvt h2, s0
-; CHECK-NEXT: add x0, sp, #56
-; CHECK-NEXT: mov h1, v1.h[2]
-; CHECK-NEXT: fcvt s0, h1
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: mov v2.h[1], v1.h[0]
-; CHECK-NEXT: str q2, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: fcvt h2, s0
-; CHECK-NEXT: add x0, sp, #60
-; CHECK-NEXT: mov h1, v1.h[3]
-; CHECK-NEXT: fcvt s0, h1
-; CHECK-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: mov v1.h[2], v2.h[0]
-; CHECK-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: ldp s2, s1, [sp, #40]
-; CHECK-NEXT: fcvt h4, s0
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT: fcvt h3, s1
-; CHECK-NEXT: fcvt h1, s2
-; CHECK-NEXT: ldr s2, [sp, #56]
-; CHECK-NEXT: mov v0.h[3], v4.h[0]
-; CHECK-NEXT: fcvt h2, s2
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: mov v1.h[1], v3.h[0]
-; CHECK-NEXT: ldr s3, [sp, #60]
-; CHECK-NEXT: mov v1.h[2], v2.h[0]
-; CHECK-NEXT: fcvt h2, s3
-; CHECK-NEXT: mov v1.h[3], v2.h[0]
-; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: add sp, sp, #64
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_modf_v2f16:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sub sp, sp, #64
+; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: mov h1, v0.h[1]
+; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT: add x0, sp, #44
+; CHECK-SD-NEXT: fcvt s0, h1
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT: fcvt h0, s0
+; CHECK-SD-NEXT: add x0, sp, #40
+; CHECK-SD-NEXT: fcvt s1, h1
+; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: fmov s0, s1
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT: fcvt h2, s0
+; CHECK-SD-NEXT: add x0, sp, #56
+; CHECK-SD-NEXT: mov h1, v1.h[2]
+; CHECK-SD-NEXT: fcvt s0, h1
+; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: mov v2.h[1], v1.h[0]
+; CHECK-SD-NEXT: str q2, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT: fcvt h2, s0
+; CHECK-SD-NEXT: add x0, sp, #60
+; CHECK-SD-NEXT: mov h1, v1.h[3]
+; CHECK-SD-NEXT: fcvt s0, h1
+; CHECK-SD-NEXT: ldr q1, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-SD-NEXT: str q1, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: ldp s2, s1, [sp, #40]
+; CHECK-SD-NEXT: fcvt h4, s0
+; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-SD-NEXT: fcvt h3, s1
+; CHECK-SD-NEXT: fcvt h1, s2
+; CHECK-SD-NEXT: ldr s2, [sp, #56]
+; CHECK-SD-NEXT: mov v0.h[3], v4.h[0]
+; CHECK-SD-NEXT: fcvt h2, s2
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: mov v1.h[1], v3.h[0]
+; CHECK-SD-NEXT: ldr s3, [sp, #60]
+; CHECK-SD-NEXT: mov v1.h[2], v2.h[0]
+; CHECK-SD-NEXT: fcvt h2, s3
+; CHECK-SD-NEXT: mov v1.h[3], v2.h[0]
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: add sp, sp, #64
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_modf_v2f16:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: str d8, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: str x30, [sp, #56] // 8-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w30, -8
+; CHECK-GI-NEXT: .cfi_offset b8, -16
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: mov h8, v0.h[1]
+; CHECK-GI-NEXT: add x0, sp, #40
+; CHECK-GI-NEXT: fcvt s0, h0
+; CHECK-GI-NEXT: bl modff
+; CHECK-GI-NEXT: fcvt h0, s0
+; CHECK-GI-NEXT: ldr s1, [sp, #40]
+; CHECK-GI-NEXT: add x0, sp, #44
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fcvt h0, s1
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fcvt s0, h8
+; CHECK-GI-NEXT: bl modff
+; CHECK-GI-NEXT: ldr s1, [sp, #44]
+; CHECK-GI-NEXT: fcvt h3, s0
+; CHECK-GI-NEXT: ldr x30, [sp, #56] // 8-byte Folded Reload
+; CHECK-GI-NEXT: ldr d8, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: fcvt h2, s1
+; CHECK-GI-NEXT: ldp q0, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.h[1], v3.h[0]
+; CHECK-GI-NEXT: mov v1.h[1], v2.h[0]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT: add sp, sp, #64
+; CHECK-GI-NEXT: ret
%result = call { <2 x half>, <2 x half> } @llvm.modf.v2f16(<2 x half> %a)
ret { <2 x half>, <2 x half> } %result
}
@@ -130,80 +165,156 @@ define { float, float } @test_modf_f32(float %a) {
}
define { <3 x float>, <3 x float> } @test_modf_v3f32(<3 x float> %a) {
-; CHECK-LABEL: test_modf_v3f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #80
-; CHECK-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
-; CHECK-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 80
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w20, -16
-; CHECK-NEXT: .cfi_offset w30, -32
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: mov s0, v0.s[1]
-; CHECK-NEXT: add x0, sp, #56
-; CHECK-NEXT: add x19, sp, #56
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: add x0, sp, #44
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: ldr q1, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: add x0, sp, #60
-; CHECK-NEXT: add x20, sp, #60
-; CHECK-NEXT: mov v0.s[1], v1.s[0]
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: mov s0, v0.s[2]
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: ldr s1, [sp, #44]
-; CHECK-NEXT: ldr q2, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
-; CHECK-NEXT: ld1 { v1.s }[1], [x19]
-; CHECK-NEXT: mov v2.s[2], v0.s[0]
-; CHECK-NEXT: ld1 { v1.s }[2], [x20]
-; CHECK-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
-; CHECK-NEXT: mov v0.16b, v2.16b
-; CHECK-NEXT: add sp, sp, #80
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_modf_v3f32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sub sp, sp, #80
+; CHECK-SD-NEXT: str x30, [sp, #48] // 8-byte Folded Spill
+; CHECK-SD-NEXT: stp x20, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 80
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w20, -16
+; CHECK-SD-NEXT: .cfi_offset w30, -32
+; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: mov s0, v0.s[1]
+; CHECK-SD-NEXT: add x0, sp, #56
+; CHECK-SD-NEXT: add x19, sp, #56
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT: add x0, sp, #44
+; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT: add x0, sp, #60
+; CHECK-SD-NEXT: add x20, sp, #60
+; CHECK-SD-NEXT: mov v0.s[1], v1.s[0]
+; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT: ldr q0, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: mov s0, v0.s[2]
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: ldr s1, [sp, #44]
+; CHECK-SD-NEXT: ldr q2, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT: ldr x30, [sp, #48] // 8-byte Folded Reload
+; CHECK-SD-NEXT: ld1 { v1.s }[1], [x19]
+; CHECK-SD-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-SD-NEXT: ld1 { v1.s }[2], [x20]
+; CHECK-SD-NEXT: ldp x20, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-SD-NEXT: mov v0.16b, v2.16b
+; CHECK-SD-NEXT: add sp, sp, #80
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_modf_v3f32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub sp, sp, #112
+; CHECK-GI-NEXT: stp d9, d8, [sp, #80] // 16-byte Folded Spill
+; CHECK-GI-NEXT: stp x30, x19, [sp, #96] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 112
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -24
+; CHECK-GI-NEXT: .cfi_offset b9, -32
+; CHECK-GI-NEXT: add x0, sp, #68
+; CHECK-GI-NEXT: mov s8, v0.s[1]
+; CHECK-GI-NEXT: mov s9, v0.s[2]
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT: bl modff
+; CHECK-GI-NEXT: ldr s1, [sp, #68]
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: add x0, sp, #72
+; CHECK-GI-NEXT: stp q0, q1, [sp, #32] // 32-byte Folded Spill
+; CHECK-GI-NEXT: fmov s0, s8
+; CHECK-GI-NEXT: bl modff
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: add x0, sp, #76
+; CHECK-GI-NEXT: add x19, sp, #76
+; CHECK-GI-NEXT: ldr s0, [sp, #72]
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s0, s9
+; CHECK-GI-NEXT: bl modff
+; CHECK-GI-NEXT: ldp q3, q2, [sp, #16] // 32-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: ldr q1, [sp] // 16-byte Folded Reload
+; CHECK-GI-NEXT: ldp d9, d8, [sp, #80] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v2.s[1], v1.s[0]
+; CHECK-GI-NEXT: ldr q1, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v1.s[1], v3.s[0]
+; CHECK-GI-NEXT: mov v2.s[2], v0.s[0]
+; CHECK-GI-NEXT: ld1 { v1.s }[2], [x19]
+; CHECK-GI-NEXT: ldp x30, x19, [sp, #96] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: add sp, sp, #112
+; CHECK-GI-NEXT: ret
%result = call { <3 x float>, <3 x float> } @llvm.modf.v3f32(<3 x float> %a)
ret { <3 x float>, <3 x float> } %result
}
define { <2 x float>, <2 x float> } @test_modf_v2f32(<2 x float> %a) {
-; CHECK-LABEL: test_modf_v2f32:
-; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: .cfi_offset w19, -8
-; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: add x0, sp, #40
-; CHECK-NEXT: add x19, sp, #40
-; CHECK-NEXT: mov s0, v0.s[1]
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: add x0, sp, #44
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 killed $q0
-; CHECK-NEXT: bl modff
-; CHECK-NEXT: ldr s1, [sp, #44]
-; CHECK-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $s0 killed $s0 def $q0
-; CHECK-NEXT: ld1 { v1.s }[1], [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: mov v0.s[1], v2.s[0]
-; CHECK-NEXT: // kill: def $d1 killed $d1 killed $q1
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: add sp, sp, #64
-; CHECK-NEXT: ret
+; CHECK-SD-LABEL: test_modf_v2f32:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sub sp, sp, #64
+; CHECK-SD-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT: add x0, sp, #40
+; CHECK-SD-NEXT: add x19, sp, #40
+; CHECK-SD-NEXT: mov s0, v0.s[1]
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: add x0, sp, #44
+; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-SD-NEXT: bl modff
+; CHECK-SD-NEXT: ldr s1, [sp, #44]
+; CHECK-SD-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-SD-NEXT: ld1 { v1.s }[1], [x19]
+; CHECK-SD-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT: mov v0.s[1], v2.s[0]
+; CHECK-SD-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: add sp, sp, #64
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_modf_v2f32:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub sp, sp, #64
+; CHECK-GI-NEXT: str d8, [sp, #32] // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 64
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -32
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: add x0, sp, #40
+; CHECK-GI-NEXT: mov s8, v0.s[1]
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 killed $q0
+; CHECK-GI-NEXT: bl modff
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: add x0, sp, #44
+; CHECK-GI-NEXT: add x19, sp, #44
+; CHECK-GI-NEXT: ldr s0, [sp, #40]
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov s0, s8
+; CHECK-GI-NEXT: bl modff
+; CHECK-GI-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $s0 killed $s0 def $q0
+; CHECK-GI-NEXT: ldr d8, [sp, #32] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v2.s[1], v0.s[0]
+; CHECK-GI-NEXT: ld1 { v1.s }[1], [x19]
+; CHECK-GI-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d1 killed $d1 killed $q1
+; CHECK-GI-NEXT: fmov d0, d2
+; CHECK-GI-NEXT: add sp, sp, #64
+; CHECK-GI-NEXT: ret
%result = call { <2 x float>, <2 x float> } @llvm.modf.v2f32(<2 x float> %a)
ret { <2 x float>, <2 x float> } %result
}
@@ -224,32 +335,80 @@ define { double, double } @test_modf_f64(double %a) {
}
define { <2 x double>, <2 x double> } @test_modf_v2f64(<2 x double> %a) {
-; CHECK-LABEL: test_modf_v2f64:
+; CHECK-SD-LABEL: test_modf_v2f64:
+; CHECK-SD: // %bb.0:
+; CHECK-SD-NEXT: sub sp, sp, #64
+; CHECK-SD-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-SD-NEXT: .cfi_def_cfa_offset 64
+; CHECK-SD-NEXT: .cfi_offset w19, -8
+; CHECK-SD-NEXT: .cfi_offset w30, -16
+; CHECK-SD-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-SD-NEXT: mov d0, v0.d[1]
+; CHECK-SD-NEXT: add x0, sp, #32
+; CHECK-SD-NEXT: add x19, sp, #32
+; CHECK-SD-NEXT: bl modf
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-SD-NEXT: add x0, sp, #40
+; CHECK-SD-NEXT: ldr q0, [sp] // 16-byte Folded Reload
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-SD-NEXT: bl modf
+; CHECK-SD-NEXT: ldr d1, [sp, #40]
+; CHECK-SD-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
+; CHECK-SD-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-SD-NEXT: ld1 { v1.d }[1], [x19]
+; CHECK-SD-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-SD-NEXT: mov v0.d[1], v2.d[0]
+; CHECK-SD-NEXT: add sp, sp, #64
+; CHECK-SD-NEXT: ret
+;
+; CHECK-GI-LABEL: test_modf_v2f64:
+; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: sub sp, sp, #80
+; CHECK-GI-NEXT: str d8, [sp, #48] // 8-byte Folded Spill
+; CHECK-GI-NEXT: stp x30, x19, [sp, #64] // 16-byte Folded Spill
+; CHECK-GI-NEXT: .cfi_def_cfa_offset 80
+; CHECK-GI-NEXT: .cfi_offset w19, -8
+; CHECK-GI-NEXT: .cfi_offset w30, -16
+; CHECK-GI-NEXT: .cfi_offset b8, -32
+; CHECK-GI-NEXT: add x0, sp, #40
+; CHECK-GI-NEXT: mov d8, v0.d[1]
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 killed $q0
+; CHECK-GI-NEXT: bl modf
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: str q0, [sp] // 16-byte Folded Spill
+; CHECK-GI-NEXT: add x0, sp, #56
+; CHECK-GI-NEXT: add x19, sp, #56
+; CHECK-GI-NEXT: ldr d0, [sp, #40]
+; CHECK-GI-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
+; CHECK-GI-NEXT: fmov d0, d8
+; CHECK-GI-NEXT: bl modf
+; CHECK-GI-NEXT: ldp q2, q1, [sp] // 32-byte Folded Reload
+; CHECK-GI-NEXT: // kill: def $d0 killed $d0 def $q0
+; CHECK-GI-NEXT: ldr d8, [sp, #48] // 8-byte Folded Reload
+; CHECK-GI-NEXT: mov v2.d[1], v0.d[0]
+; CHECK-GI-NEXT: ld1 { v1.d }[1], [x19]
+; CHECK-GI-NEXT: ldp x30, x19, [sp, #64] // 16-byte Folded Reload
+; CHECK-GI-NEXT: mov v0.16b, v2.16b
+; CHECK-GI-NEXT: add sp, sp, #80
+; CHECK-GI-NEXT: ret
+ %result = call { <2 x double>, <2 x double> } @llvm.modf.v2f64(<2 x double> %a)
+ ret { <2 x double>, <2 x double> } %result
+}
+
+define { fp128, fp128 } @test_modf_fp128(fp128 %a) {
+; CHECK-LABEL: test_modf_fp128:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #64
-; CHECK-NEXT: stp x30, x19, [sp, #48] // 16-byte Folded Spill
-; CHECK-NEXT: .cfi_def_cfa_offset 64
-; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: sub sp, sp, #32
+; CHECK-NEXT: str x30, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 32
; CHECK-NEXT: .cfi_offset w30, -16
-; CHECK-NEXT: str q0, [sp] // 16-byte Folded Spill
-; CHECK-NEXT: mov d0, v0.d[1]
-; CHECK-NEXT: add x0, sp, #32
-; CHECK-NEXT: add x19, sp, #32
-; CHECK-NEXT: bl modf
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: str q0, [sp, #16] // 16-byte Folded Spill
-; CHECK-NEXT: add x0, sp, #40
-; CHECK-NEXT: ldr q0, [sp] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $d0 killed $d0 killed $q0
-; CHECK-NEXT: bl modf
-; CHECK-NEXT: ldr d1, [sp, #40]
-; CHECK-NEXT: ldr q2, [sp, #16] // 16-byte Folded Reload
-; CHECK-NEXT: // kill: def $d0 killed $d0 def $q0
-; CHECK-NEXT: ld1 { v1.d }[1], [x19]
-; CHECK-NEXT: ldp x30, x19, [sp, #48] // 16-byte Folded Reload
-; CHECK-NEXT: mov v0.d[1], v2.d[0]
-; CHECK-NEXT: add sp, sp, #64
+; CHECK-NEXT: mov x0, sp
+; CHECK-NEXT: bl modfl
+; CHECK-NEXT: ldr q1, [sp]
+; CHECK-NEXT: ldr x30, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #32
; CHECK-NEXT: ret
- %result = call { <2 x double>, <2 x double> } @llvm.modf.v2f64(<2 x double> %a)
- ret { <2 x double>, <2 x double> } %result
+ %result = call { fp128, fp128 } @llvm.modf.fp128(fp128 %a)
+ ret { fp128, fp128 } %result
}
diff --git a/llvm/test/CodeGen/AArch64/pr161420.ll b/llvm/test/CodeGen/AArch64/pr161420.ll
index 515a1bf..dcdf0ed 100644
--- a/llvm/test/CodeGen/AArch64/pr161420.ll
+++ b/llvm/test/CodeGen/AArch64/pr161420.ll
@@ -5,17 +5,20 @@ target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
target triple = "arm64-apple-macosx15.0.0"
; From: https://github.com/llvm/llvm-project/issues/161420. This test checks that
-; two `luti4` instructions are emitted. FIXME: This is currently broken!
+; two `luti4` instructions are emitted.
define void @pluto(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3) #0 {
; CHECK-LABEL: pluto:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: mov w8, #0 ; =0x0
; CHECK-NEXT: ldr zt0, [x1]
-; CHECK-NEXT: ldr z0, [x3]
+; CHECK-NEXT: ldr z4, [x3]
; CHECK-NEXT: ptrue pn8.h
-; CHECK-NEXT: ld1h { z4.h - z7.h }, pn8/z, [x0]
-; CHECK-NEXT: luti4 { z0.h - z3.h }, zt0, z0[0]
-; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z4.h - z7.h }, { z0.h - z3.h }
+; CHECK-NEXT: ld1h { z0.h - z3.h }, pn8/z, [x0]
+; CHECK-NEXT: luti4 { z16.h - z19.h }, zt0, z4[0]
+; CHECK-NEXT: fmla za.h[w8, 0, vgx4], { z0.h - z3.h }, { z16.h - z19.h }
+; CHECK-NEXT: ldr zt0, [x2]
+; CHECK-NEXT: luti4 { z4.h - z7.h }, zt0, z4[0]
+; CHECK-NEXT: fmla za.h[w8, 2, vgx4], { z0.h - z3.h }, { z4.h - z7.h }
; CHECK-NEXT: ret
bb:
tail call void @llvm.aarch64.sme.ldr.zt(i32 0, ptr %arg1)
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll
index cf306e52..d48e0cd 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4-lane-x4.ll
@@ -49,10 +49,13 @@ define {<vscale x 4 x float>, <vscale x 4 x float>, <vscale x 4 x float>, <vscal
}
; Tests that multiple identical luti4 intrinsics, with ZT0 loads interspersed, are not CSE'd.
-; FIXME: This is currently broken!
define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, <vscale x 16 x i8> %x) {
; CHECK-LABEL: test_multiple_luti4_zt_i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: ldr zt0, [x0]
+; CHECK-NEXT: luti4 { z4.s - z7.s }, zt0, z0[1]
+; CHECK-NEXT: // fake_use: $z4 $z4_z5_z6_z7
+; CHECK-NEXT: ldr zt0, [x1]
; CHECK-NEXT: luti4 { z0.s - z3.s }, zt0, z0[1]
; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
index 0024b70..c1eff8d 100644
--- a/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
+++ b/llvm/test/CodeGen/AArch64/sme2-intrinsics-luti4.ll
@@ -15,12 +15,15 @@ define {<vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16 x i8>, <vscale x 16
}
; Tests that multiple identical luti4 intrinsics, with ZT0 loads interspersed, are not CSE'd.
-; FIXME: This is currently broken!
define void @test_multiple_luti4_zt_i8(ptr %ptrA, ptr %ptrB, <vscale x 16 x i8> %v0, <vscale x 16 x i8> %v1) #0 {
; CHECK-LABEL: test_multiple_luti4_zt_i8:
; CHECK: // %bb.0:
+; CHECK-NEXT: ldr zt0, [x0]
; CHECK-NEXT: // kill: def $z1 killed $z1 killed $z0_z1 def $z0_z1
; CHECK-NEXT: // kill: def $z0 killed $z0 killed $z0_z1 def $z0_z1
+; CHECK-NEXT: luti4 { z4.b - z7.b }, zt0, { z0, z1 }
+; CHECK-NEXT: // fake_use: $z4 $z4_z5_z6_z7
+; CHECK-NEXT: ldr zt0, [x1]
; CHECK-NEXT: luti4 { z0.b - z3.b }, zt0, { z0, z1 }
; CHECK-NEXT: // fake_use: $z0 $z0_z1_z2_z3
; CHECK-NEXT: ret
diff --git a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
index bff0cac..0298168 100644
--- a/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
+++ b/llvm/test/CodeGen/AArch64/spill-fill-zpr-predicates.mir
@@ -983,26 +983,22 @@ body: |
; EXPAND-LABEL: name: zpr_predicate_spill_p4_saved
; EXPAND: liveins: $p0, $p1, $p2, $p3, $fp, $p8, $p4
; EXPAND-NEXT: {{ $}}
- ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1040, 0
- ; EXPAND-NEXT: frame-setup STRXui killed $fp, $sp, 128 :: (store (s64) into %stack.3)
+ ; EXPAND-NEXT: early-clobber $sp = frame-setup STRXpre killed $fp, $sp, -16 :: (store (s64) into %stack.2)
; EXPAND-NEXT: $sp = frame-setup ADDVL_XXI $sp, -2, implicit $vg
; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p8, 1, 0
- ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.2)
+ ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 0 :: (store (s128) into %stack.1)
; EXPAND-NEXT: $z0 = frame-setup CPY_ZPzI_B killed $p4, 1, 0
- ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.1)
- ; EXPAND-NEXT: $sp = frame-setup SUBXri $sp, 1024, 0
+ ; EXPAND-NEXT: frame-setup STR_ZXI $z0, $sp, 1 :: (store (s128) into %stack.0)
;
; EXPAND-NEXT: $p8 = IMPLICIT_DEF
;
- ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1024, 0
- ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.2)
+ ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 0 :: (load (s128) from %stack.1)
; EXPAND-NEXT: $p4 = frame-destroy PTRUE_B 31, implicit $vg
; EXPAND-NEXT: $p8 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
- ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.1)
+ ; EXPAND-NEXT: $z0 = frame-destroy LDR_ZXI $sp, 1 :: (load (s128) from %stack.0)
; EXPAND-NEXT: $p4 = frame-destroy CMPNE_PPzZI_B $p4, $z0, 0, implicit-def $nzcv, implicit-def $nzcv
; EXPAND-NEXT: $sp = frame-destroy ADDVL_XXI $sp, 2, implicit $vg
- ; EXPAND-NEXT: $fp = frame-destroy LDRXui $sp, 128 :: (load (s64) from %stack.3)
- ; EXPAND-NEXT: $sp = frame-destroy ADDXri $sp, 1040, 0
+ ; EXPAND-NEXT: early-clobber $sp, $fp = frame-destroy LDRXpost $sp, 16 :: (load (s64) from %stack.2)
; EXPAND-NEXT: RET undef $lr, implicit $p0, implicit $p1, implicit $p2, implicit $p3
; If we spill a register above p8, p4 must also be saved, so we can guarantee
diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
index 2b16dd0f..5569175 100644
--- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir
+++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir
@@ -39,7 +39,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr
; EXPAND: STR_PXI $p0, $sp, 7
@@ -82,7 +82,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr2
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr2
; EXPAND: STR_PXI $p0, $sp, 6
@@ -127,7 +127,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_ppr2
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_ppr2mul2
; EXPAND: STR_PXI $p0, $sp, 6
@@ -172,7 +172,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_pnr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_pnr
; EXPAND: STR_PXI $pn0, $sp, 7
@@ -211,7 +211,7 @@ body: |
; CHECK-LABEL: name: spills_fills_stack_id_virtreg_pnr
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: spill-slot, offset: 0, size: 2, alignment: 2
- ; CHECK-NEXT: stack-id: scalable-vector, callee-saved-register: ''
+ ; CHECK-NEXT: stack-id: scalable-predicate-vector, callee-saved-register: ''
; EXPAND-LABEL: name: spills_fills_stack_id_virtreg_pnr
; EXPAND: renamable $pn8 = WHILEGE_CXX_B
diff --git a/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
new file mode 100644
index 0000000..690a39d
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/split-sve-stack-frame-layout.ll
@@ -0,0 +1,824 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 | FileCheck %s
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-in-non-streaming -aarch64-split-sve-objects -aarch64-streaming-hazard-size=1024 -pass-remarks-analysis=stack-frame-layout 2>&1 >/dev/null | FileCheck %s --check-prefixes=CHECK-FRAMELAYOUT
+
+; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; %ppr_local sp+2048+30*vscale (= #15, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding sp+2048+16*vscale
+; <hazard padding> sp+1024+16*vscale
+; %zpr_local sp+1024
+; <hazard padding>
+; -> sp
+define void @zpr_and_ppr_local(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: zpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0x90, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2064 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2048
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: add x8, sp, #1024
+; CHECK-NEXT: str z0, [x8]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: zpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding fp-16*vscale
+; <hazard padding> fp-1024-16*vscale
+; %zpr_local fp-1024-32*vscale (= #-2, mul vl for str/ldr ZPR)
+; <hazard padding>
+; -> sp
+define void @zpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: zpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str z0, [x8, #-2, mul vl]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; %ppr_local sp+2064+14*vscale (= #7, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding sp+2064
+; <hazard padding> sp+1040
+; %fpr_local sp+1032
+; 8 bytes of padding sp+1024
+; <hazard padding>
+; -> sp
+define void @fpr_and_ppr_local(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: fpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x38, 0x1e, 0x22 // sp + 2080 + 8 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2064
+; CHECK-NEXT: str p0, [x8, #7, mul vl]
+; CHECK-NEXT: str d0, [sp, #1032]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %fpr_local = alloca double
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile double %double, ptr %fpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: fpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-16 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-16 x vscale], Type: Variable, Align: 16, Size: 1024
+
+; <GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding>
+; %fpr_local sp+1032
+; 8 bytes of padding sp+1024
+; <hazard padding>
+; -> sp
+define void @fpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, double %double) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: fpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str d0, [sp, #1032]
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %fpr_local = alloca double
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile double %double, ptr %fpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; %ppr_local sp+2064+30*vscale (= #15, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding> sp+1040+16*vscale
+; <fpr callee save: z8> sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding
+; -> sp
+define void @gpr_and_ppr_local(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: gpr_and_ppr_local:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xa0, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2080 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+; CHECK-NEXT: add x8, sp, #2064
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: str x0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed
+ %ppr_local = alloca <vscale x 16 x i1>
+ %gpr_local = alloca i64
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile i64 %int, ptr %gpr_local
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: gpr_and_ppr_local_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-32 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2064-32 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2072-32 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; -> fp
+; %ppr_local fp-2*vscale (= #-1, mul vl for str/ldr PPR)
+; 14 x vscale bytes of padding
+; <hazard padding>
+; <fpr callee save: z8>
+; <hazard padding>
+; %gpr_local sp+8
+; 8 bytes of padding
+; -> sp
+define void @gpr_and_ppr_local_fp(<vscale x 16 x i1> %pred, i64 %int) "aarch64_pstate_sm_compatible" "frame-pointer"="all" {
+; CHECK-LABEL: gpr_and_ppr_local_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str z8, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 16 * VG - 1040
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: str p0, [x29, #-1, mul vl]
+; CHECK-NEXT: str x0, [sp, #8]
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ldr z8, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ tail call void asm sideeffect "", "~{d8}"() #1 ; Spill an FPR so hazard padding is needed
+ %ppr_local = alloca <vscale x 16 x i1> ; Scalable predicate local; expected to be addressed from the frame pointer at [x29, #-1, mul vl]
+ %gpr_local = alloca i64 ; Plain 64-bit integer local; expected to stay sp-relative at [sp, #8]
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local ; Matched by the "str p0" CHECK line
+ store volatile i64 %int, ptr %gpr_local ; Matched by the "str x0" CHECK line
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1040-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1048-320 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2080-320 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2088-320 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; <CS PPRs>
+; %ppr_local sp+2080+286*vscale (addvl #17, addpl #7)
+; 14 * vscale bytes of padding sp+2080+272*vscale
+; <hazard padding> sp+1056+272*vscale
+; <CS ZPRs> sp+1056+16*vscale
+; %zpr_local sp+1056
+; %fpr_local sp+1048
+; 8 bytes of padding sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding sp
+; -> sp
+define void @all_stack_areas(<vscale x 16 x i1> %pred, double %fp) {
+; CHECK-LABEL: all_stack_areas:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-17
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0b, 0x8f, 0xb0, 0x10, 0x92, 0x2e, 0x00, 0x11, 0xa0, 0x01, 0x1e, 0x22 // sp + 2096 + 160 * VG
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1040
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xf0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1040
+; CHECK-NEXT: add x0, sp, #2080
+; CHECK-NEXT: add x8, sp, #2080
+; CHECK-NEXT: add x1, sp, #1056
+; CHECK-NEXT: addvl x0, x0, #17
+; CHECK-NEXT: add x2, sp, #1048
+; CHECK-NEXT: add x3, sp, #8
+; CHECK-NEXT: addpl x0, x0, #7
+; CHECK-NEXT: str d0, [sp, #1048]
+; CHECK-NEXT: str p0, [x8, #143, mul vl]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1> ; Scalable predicate local; address expected as sp + 2080, then addvl #17 and addpl #7
+ %zpr_local = alloca <vscale x 16 x i8> ; Scalable vector local; address expected as sp + 1056
+ %fpr_local = alloca double ; Double-precision local; address expected as sp + 1048
+ ; // Needed to sort %fpr_local into the FPR region
+ store double %fp, ptr %fpr_local
+ ; // Needed to sort %ppr_local into the PPR region
+ store <vscale x 16 x i1> %pred, ptr %ppr_local
+ %gpr_local = alloca i64 ; Plain 64-bit integer local; address expected as sp + 8
+ call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) ; All four local addresses are passed to @foo (the CHECK lines set up x0-x3 before "bl foo")
+ ret void
+}
+declare void @foo(ptr, ptr, ptr, ptr)
+
+; CHECK-FRAMELAYOUT-LABEL: Function: all_stack_areas_fp
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32-34 x vscale], Type: Variable, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-304 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1056-320 x vscale], Type: Variable, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1064-320 x vscale], Type: Variable, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2096-320 x vscale], Type: Variable, Align: 16, Size: 1024
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2104-320 x vscale], Type: Variable, Align: 8, Size: 8
+
+; <CS GPRs>
+; -> fp
+; <CS PPRs> fp-32*vscale
+; %ppr_local fp-34*vscale (addpl #-17)
+; 14 * vscale bytes of padding fp-48*vscale
+; <hazard padding> fp-1024-48*vscale
+; <CS ZPRs> fp-1024-304*vscale
+; %zpr_local fp-1024-320*vscale (addvl #-20)
+; %fpr_local sp+1048
+; 8 bytes of padding sp+1040
+; <hazard padding> sp+16
+; %gpr_local sp+8
+; 8 bytes of padding sp
+; -> sp
+define void @all_stack_areas_fp(<vscale x 16 x i1> %pred, double %fp) "frame-pointer"="all" {
+; CHECK-LABEL: all_stack_areas_fp:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-32]! // 16-byte Folded Spill
+; CHECK-NEXT: str x28, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-17
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: .cfi_def_cfa w29, 32
+; CHECK-NEXT: .cfi_offset w28, -16
+; CHECK-NEXT: .cfi_offset w30, -24
+; CHECK-NEXT: .cfi_offset w29, -32
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d8 @ cfa - 32 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d9 @ cfa - 40 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d10 @ cfa - 48 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d11 @ cfa - 56 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0b, 0x92, 0x2e, 0x00, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d12 @ cfa - 64 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d13 @ cfa - 72 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d14 @ cfa - 80 * VG - 1056
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0c, 0x92, 0x2e, 0x00, 0x11, 0xa8, 0x7f, 0x1e, 0x22, 0x11, 0xe0, 0x77, 0x22 // $d15 @ cfa - 88 * VG - 1056
+; CHECK-NEXT: sub x1, x29, #1024
+; CHECK-NEXT: addpl x0, x29, #-17
+; CHECK-NEXT: add x2, sp, #1048
+; CHECK-NEXT: addvl x1, x1, #-20
+; CHECK-NEXT: add x3, sp, #8
+; CHECK-NEXT: str d0, [sp, #1048]
+; CHECK-NEXT: str p0, [x29, #-17, mul vl]
+; CHECK-NEXT: bl foo
+; CHECK-NEXT: add sp, sp, #1056
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #17
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x28, [sp, #16] // 8-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #32 // 16-byte Folded Reload
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1> ; Scalable predicate local; address expected from the frame pointer via addpl x0, x29, #-17
+ %zpr_local = alloca <vscale x 16 x i8> ; Scalable vector local; address expected as x29 - 1024, then addvl #-20
+ %fpr_local = alloca double ; Double-precision local; address expected as sp + 1048
+ ; // Needed to sort %fpr_local into the FPR region
+ store double %fp, ptr %fpr_local
+ ; // Needed to sort %ppr_local into the PPR region
+ store <vscale x 16 x i1> %pred, ptr %ppr_local
+ %gpr_local = alloca i64 ; Plain 64-bit integer local; address expected as sp + 8
+ call void @foo(ptr %ppr_local, ptr %zpr_local, ptr %fpr_local, ptr %gpr_local) ; All four local addresses are passed to @foo (the CHECK lines set up x0-x3 before "bl foo")
+ ret void
+}
+
+; CHECK-FRAMELAYOUT-LABEL: Function: svecc_call
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-8], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-16], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-24], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-32], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-48], Type: Spill, Align: 16, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-56], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64], Type: Spill, Align: 8, Size: 8
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-2 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-4 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-6 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-8 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-10 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-12 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-14 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-16 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-18 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-20 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-22 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-64-24 x vscale], Type: Spill, Align: 2, Size: vscale x 2
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-48 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-64 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-80 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-96 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-112 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-128 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-144 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-160 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-176 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-192 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-208 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-224 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-240 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-256 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-272 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-1088-288 x vscale], Type: Spill, Align: 16, Size: vscale x 16
+; CHECK-FRAMELAYOUT-NEXT: Offset: [SP-2112-288 x vscale], Type: Variable, Align: 16, Size: 1024
+
+define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3, i16 %P4) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: svecc_call:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_def_cfa_offset 64
+; CHECK-NEXT: cntd x9
+; CHECK-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK-NEXT: mov x29, sp
+; CHECK-NEXT: .cfi_def_cfa w29, 64
+; CHECK-NEXT: .cfi_offset w19, -8
+; CHECK-NEXT: .cfi_offset w26, -16
+; CHECK-NEXT: .cfi_offset w27, -24
+; CHECK-NEXT: .cfi_offset w28, -32
+; CHECK-NEXT: .cfi_offset vg, -48
+; CHECK-NEXT: .cfi_offset w30, -56
+; CHECK-NEXT: .cfi_offset w29, -64
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-16
+; CHECK-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: mov x8, x0
+; CHECK-NEXT: bl __arm_sme_state
+; CHECK-NEXT: mov x19, x0
+; CHECK-NEXT: //APP
+; CHECK-NEXT: //NO_APP
+; CHECK-NEXT: tbz w19, #0, .LBB8_2
+; CHECK-NEXT: // %bb.1: // %entry
+; CHECK-NEXT: smstop sm
+; CHECK-NEXT: .LBB8_2: // %entry
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov w1, #45 // =0x2d
+; CHECK-NEXT: mov w2, #37 // =0x25
+; CHECK-NEXT: bl memset
+; CHECK-NEXT: tbz w19, #0, .LBB8_4
+; CHECK-NEXT: // %bb.3: // %entry
+; CHECK-NEXT: smstart sm
+; CHECK-NEXT: .LBB8_4: // %entry
+; CHECK-NEXT: mov w0, #22647 // =0x5877
+; CHECK-NEXT: movk w0, #59491, lsl #16
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #16
+; CHECK-NEXT: .cfi_restore z8
+; CHECK-NEXT: .cfi_restore z9
+; CHECK-NEXT: .cfi_restore z10
+; CHECK-NEXT: .cfi_restore z11
+; CHECK-NEXT: .cfi_restore z12
+; CHECK-NEXT: .cfi_restore z13
+; CHECK-NEXT: .cfi_restore z14
+; CHECK-NEXT: .cfi_restore z15
+; CHECK-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: .cfi_def_cfa wsp, 64
+; CHECK-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: .cfi_restore w19
+; CHECK-NEXT: .cfi_restore w26
+; CHECK-NEXT: .cfi_restore w27
+; CHECK-NEXT: .cfi_restore w28
+; CHECK-NEXT: .cfi_restore vg
+; CHECK-NEXT: .cfi_restore w30
+; CHECK-NEXT: .cfi_restore w29
+; CHECK-NEXT: ret
+entry:
+ tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2 ; Empty asm that clobbers x0, x28, x27 and x3
+ %call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37) ; The CHECK lines expect this call wrapped in conditional smstop/smstart keyed on the __arm_sme_state result
+ ret i32 -396142473 ; Equals 0xE8635877 as an unsigned 32-bit value; the CHECK lines build it with mov + movk
+}
+declare ptr @memset(ptr, i32, i32)
+
+; FIXME: aarch64-split-sve-objects is currently not supported in this function
+; as it requires stack realignment (for the 32-byte aligned alloca).
+; GPR CSRs
+; <hazard padding>
+; FPR CSRs
+; <hazard padding>
+; <SVE locals (PPRs and ZPRs)> <--- hazard between PPRs and ZPRs here!
+; <realignment padding>
+; -> sp
+define void @zpr_and_ppr_local_realignment(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr) "aarch64_pstate_sm_compatible" {
+; CHECK-LABEL: zpr_and_ppr_local_realignment:
+; CHECK: // %bb.0:
+; CHECK-NEXT: sub sp, sp, #1040
+; CHECK-NEXT: sub x9, sp, #1040
+; CHECK-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK-NEXT: add x29, sp, #1024
+; CHECK-NEXT: addvl x9, x9, #-2
+; CHECK-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK-NEXT: and sp, x9, #0xffffffffffffffe0
+; CHECK-NEXT: .cfi_def_cfa w29, 16
+; CHECK-NEXT: .cfi_offset w30, -8
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: sub x8, x29, #1024
+; CHECK-NEXT: str p0, [x8, #-1, mul vl]
+; CHECK-NEXT: str z0, [x8, #-2, mul vl]
+; CHECK-NEXT: str x0, [sp]
+; CHECK-NEXT: sub sp, x29, #1024
+; CHECK-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK-NEXT: add sp, sp, #1040
+; CHECK-NEXT: ret
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %gpr_local = alloca i64, align 32
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ store volatile i64 %gpr, ptr %gpr_local
+ ret void
+}
+
+define void @zpr_and_ppr_local_stack_probing(<vscale x 16 x i1> %pred, <vscale x 16 x i8> %vector, i64 %gpr)
+; CHECK-LABEL: zpr_and_ppr_local_stack_probing:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: sub sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: sub sp, sp, #1824
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str xzr, [sp]
+; CHECK-NEXT: .cfi_escape 0x0f, 0x09, 0x8f, 0xb0, 0x16, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 2864 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: add x8, sp, #2848
+; CHECK-NEXT: str p0, [x8, #15, mul vl]
+; CHECK-NEXT: add x8, sp, #1824
+; CHECK-NEXT: str z0, [x8]
+; CHECK-NEXT: str x0, [sp]
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1024
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: add sp, sp, #1824
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ "probe-stack"="inline-asm" "stack-probe-size"="4096" "frame-pointer"="none" "aarch64_pstate_sm_compatible"
+{
+ %ppr_local = alloca <vscale x 16 x i1>
+ %zpr_local = alloca <vscale x 16 x i8>
+ %gpr_local = alloca i64, i64 100, align 8
+ store volatile <vscale x 16 x i1> %pred, ptr %ppr_local
+ store volatile <vscale x 16 x i8> %vector, ptr %zpr_local
+ store volatile i64 %gpr, ptr %gpr_local
+ ret void
+}
diff --git a/llvm/test/CodeGen/AArch64/stack-hazard.ll b/llvm/test/CodeGen/AArch64/stack-hazard.ll
index 5f52280..333a8be 100644
--- a/llvm/test/CodeGen/AArch64/stack-hazard.ll
+++ b/llvm/test/CodeGen/AArch64/stack-hazard.ll
@@ -1,7 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=0 | FileCheck %s --check-prefixes=CHECK,CHECK0
; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=64 | FileCheck %s --check-prefixes=CHECK,CHECK64
-; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-NOSPLITSVE
+; RUN: llc < %s -mtriple=aarch64 -mattr=+sve2 -aarch64-split-sve-objects -aarch64-stack-hazard-size=1024 | FileCheck %s --check-prefixes=CHECK,CHECK1024,CHECK1024-SPLITSVE
define i32 @basic(i32 noundef %num) {
; CHECK-LABEL: basic:
@@ -1503,72 +1504,24 @@ define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>
}
define [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1_caller([2 x <vscale x 4 x i1>] %arg1, [2 x <vscale x 4 x i1>] %arg2) nounwind "aarch64_pstate_sm_compatible" {
-; CHECK0-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK0: // %bb.0:
-; CHECK0-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
-; CHECK0-NEXT: addvl sp, sp, #-1
-; CHECK0-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p5.b, p0.b
-; CHECK0-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK0-NEXT: mov p4.b, p1.b
-; CHECK0-NEXT: mov p0.b, p2.b
-; CHECK0-NEXT: mov p1.b, p3.b
-; CHECK0-NEXT: mov p2.b, p5.b
-; CHECK0-NEXT: mov p3.b, p4.b
-; CHECK0-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK0-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK0-NEXT: addvl sp, sp, #1
-; CHECK0-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
-; CHECK0-NEXT: ret
-;
-; CHECK64-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK64: // %bb.0:
-; CHECK64-NEXT: sub sp, sp, #80
-; CHECK64-NEXT: stp x29, x30, [sp, #64] // 16-byte Folded Spill
-; CHECK64-NEXT: addvl sp, sp, #-1
-; CHECK64-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK64-NEXT: sub sp, sp, #64
-; CHECK64-NEXT: mov p4.b, p1.b
-; CHECK64-NEXT: mov p5.b, p0.b
-; CHECK64-NEXT: mov p0.b, p2.b
-; CHECK64-NEXT: mov p1.b, p3.b
-; CHECK64-NEXT: mov p2.b, p5.b
-; CHECK64-NEXT: mov p3.b, p4.b
-; CHECK64-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK64-NEXT: add sp, sp, #64
-; CHECK64-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK64-NEXT: addvl sp, sp, #1
-; CHECK64-NEXT: ldp x29, x30, [sp, #64] // 16-byte Folded Reload
-; CHECK64-NEXT: add sp, sp, #80
-; CHECK64-NEXT: ret
-;
-; CHECK1024-LABEL: sve_signature_pred_2xv4i1_caller:
-; CHECK1024: // %bb.0:
-; CHECK1024-NEXT: sub sp, sp, #1040
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: addvl sp, sp, #-1
-; CHECK1024-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov p4.b, p1.b
-; CHECK1024-NEXT: mov p5.b, p0.b
-; CHECK1024-NEXT: mov p0.b, p2.b
-; CHECK1024-NEXT: mov p1.b, p3.b
-; CHECK1024-NEXT: mov p2.b, p5.b
-; CHECK1024-NEXT: mov p3.b, p4.b
-; CHECK1024-NEXT: bl sve_signature_pred_2xv4i1
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #1
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1040
-; CHECK1024-NEXT: ret
+; CHECK-LABEL: sve_signature_pred_2xv4i1_caller:
+; CHECK: // %bb.0:
+; CHECK-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-1
+; CHECK-NEXT: str p5, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p5.b, p0.b
+; CHECK-NEXT: str p4, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK-NEXT: mov p4.b, p1.b
+; CHECK-NEXT: mov p0.b, p2.b
+; CHECK-NEXT: mov p1.b, p3.b
+; CHECK-NEXT: mov p2.b, p5.b
+; CHECK-NEXT: mov p3.b, p4.b
+; CHECK-NEXT: bl sve_signature_pred_2xv4i1
+; CHECK-NEXT: ldr p5, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: ldr p4, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK-NEXT: addvl sp, sp, #1
+; CHECK-NEXT: ldp x29, x30, [sp], #16 // 16-byte Folded Reload
+; CHECK-NEXT: ret
%res = call [2 x <vscale x 4 x i1>] @sve_signature_pred_2xv4i1([2 x <vscale x 4 x i1>] %arg2, [2 x <vscale x 4 x i1>] %arg1)
ret [2 x <vscale x 4 x i1>] %res
}
@@ -2113,139 +2066,269 @@ define i32 @svecc_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8> %P3,
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_call:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub sp, sp, #1024
-; CHECK1024-NEXT: mov x8, x0
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB28_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB28_2: // %entry
-; CHECK1024-NEXT: mov x0, x8
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: mov w2, #37 // =0x25
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB28_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB28_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: add sp, sp, #1024
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #18
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_call:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: mov x8, x0
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB28_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, x8
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB28_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB28_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_call:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: mov x8, x0
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB28_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, x8
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB28_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB28_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
%call = call ptr @memset(ptr noundef nonnull %P1, i32 noundef 45, i32 noundef 37)
@@ -2505,138 +2588,267 @@ define i32 @svecc_alloca_call(<4 x i16> %P0, ptr %P1, i32 %P2, <vscale x 16 x i8
; CHECK64-NEXT: .cfi_restore w29
; CHECK64-NEXT: ret
;
-; CHECK1024-LABEL: svecc_alloca_call:
-; CHECK1024: // %bb.0: // %entry
-; CHECK1024-NEXT: sub sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 1088
-; CHECK1024-NEXT: cntd x9
-; CHECK1024-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
-; CHECK1024-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
-; CHECK1024-NEXT: add x29, sp, #1024
-; CHECK1024-NEXT: .cfi_def_cfa w29, 64
-; CHECK1024-NEXT: .cfi_offset w19, -16
-; CHECK1024-NEXT: .cfi_offset w26, -24
-; CHECK1024-NEXT: .cfi_offset w27, -32
-; CHECK1024-NEXT: .cfi_offset w28, -40
-; CHECK1024-NEXT: .cfi_offset vg, -48
-; CHECK1024-NEXT: .cfi_offset w30, -56
-; CHECK1024-NEXT: .cfi_offset w29, -64
-; CHECK1024-NEXT: addvl sp, sp, #-18
-; CHECK1024-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
-; CHECK1024-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
-; CHECK1024-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
-; CHECK1024-NEXT: sub sp, sp, #1072
-; CHECK1024-NEXT: bl __arm_sme_state
-; CHECK1024-NEXT: mov x19, x0
-; CHECK1024-NEXT: //APP
-; CHECK1024-NEXT: //NO_APP
-; CHECK1024-NEXT: tbz w19, #0, .LBB29_2
-; CHECK1024-NEXT: // %bb.1: // %entry
-; CHECK1024-NEXT: smstop sm
-; CHECK1024-NEXT: .LBB29_2: // %entry
-; CHECK1024-NEXT: mov x0, sp
-; CHECK1024-NEXT: mov w1, #45 // =0x2d
-; CHECK1024-NEXT: mov w2, #37 // =0x25
-; CHECK1024-NEXT: bl memset
-; CHECK1024-NEXT: tbz w19, #0, .LBB29_4
-; CHECK1024-NEXT: // %bb.3: // %entry
-; CHECK1024-NEXT: smstart sm
-; CHECK1024-NEXT: .LBB29_4: // %entry
-; CHECK1024-NEXT: mov w0, #22647 // =0x5877
-; CHECK1024-NEXT: movk w0, #59491, lsl #16
-; CHECK1024-NEXT: add sp, sp, #1072
-; CHECK1024-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
-; CHECK1024-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
-; CHECK1024-NEXT: addvl sp, sp, #18
-; CHECK1024-NEXT: .cfi_restore z8
-; CHECK1024-NEXT: .cfi_restore z9
-; CHECK1024-NEXT: .cfi_restore z10
-; CHECK1024-NEXT: .cfi_restore z11
-; CHECK1024-NEXT: .cfi_restore z12
-; CHECK1024-NEXT: .cfi_restore z13
-; CHECK1024-NEXT: .cfi_restore z14
-; CHECK1024-NEXT: .cfi_restore z15
-; CHECK1024-NEXT: .cfi_def_cfa wsp, 1088
-; CHECK1024-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
-; CHECK1024-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
-; CHECK1024-NEXT: add sp, sp, #1088
-; CHECK1024-NEXT: .cfi_def_cfa_offset 0
-; CHECK1024-NEXT: .cfi_restore w19
-; CHECK1024-NEXT: .cfi_restore w26
-; CHECK1024-NEXT: .cfi_restore w27
-; CHECK1024-NEXT: .cfi_restore w28
-; CHECK1024-NEXT: .cfi_restore vg
-; CHECK1024-NEXT: .cfi_restore w30
-; CHECK1024-NEXT: .cfi_restore w29
-; CHECK1024-NEXT: ret
+; CHECK1024-NOSPLITSVE-LABEL: svecc_alloca_call:
+; CHECK1024-NOSPLITSVE: // %bb.0: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 1088
+; CHECK1024-NOSPLITSVE-NEXT: cntd x9
+; CHECK1024-NOSPLITSVE-NEXT: str x29, [sp, #1024] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x30, [sp, #1032] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x9, [sp, #1040] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x28, [sp, #1048] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x27, [sp, #1056] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x26, [sp, #1064] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str x19, [sp, #1072] // 8-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: add x29, sp, #1024
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w19, -16
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w26, -24
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w27, -32
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w28, -40
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #-18
+; CHECK1024-NOSPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z23, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z22, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z21, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z20, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z19, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z18, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z17, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z16, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z15, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z14, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z13, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z12, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z11, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z10, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z9, [sp, #16, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: str z8, [sp, #17, mul vl] // 16-byte Folded Spill
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x78, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 8 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x70, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 16 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-NOSPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-NOSPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-NOSPLITSVE-NEXT: mov x19, x0
+; CHECK1024-NOSPLITSVE-NEXT: //APP
+; CHECK1024-NOSPLITSVE-NEXT: //NO_APP
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_2
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstop sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB29_2: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov x0, sp
+; CHECK1024-NOSPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-NOSPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-NOSPLITSVE-NEXT: bl memset
+; CHECK1024-NOSPLITSVE-NEXT: tbz w19, #0, .LBB29_4
+; CHECK1024-NOSPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: smstart sm
+; CHECK1024-NOSPLITSVE-NEXT: .LBB29_4: // %entry
+; CHECK1024-NOSPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-NOSPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1072
+; CHECK1024-NOSPLITSVE-NEXT: ldr z23, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z22, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z21, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z20, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z19, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z18, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z17, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z16, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z15, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z14, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z13, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z12, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z11, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z10, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z9, [sp, #16, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr z8, [sp, #17, mul vl] // 16-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: addvl sp, sp, #18
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa wsp, 1088
+; CHECK1024-NOSPLITSVE-NEXT: ldr x19, [sp, #1072] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x26, [sp, #1064] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x27, [sp, #1056] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x28, [sp, #1048] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x30, [sp, #1032] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: ldr x29, [sp, #1024] // 8-byte Folded Reload
+; CHECK1024-NOSPLITSVE-NEXT: add sp, sp, #1088
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-NOSPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-NOSPLITSVE-NEXT: ret
+;
+; CHECK1024-SPLITSVE-LABEL: svecc_alloca_call:
+; CHECK1024-SPLITSVE: // %bb.0: // %entry
+; CHECK1024-SPLITSVE-NEXT: stp x29, x30, [sp, #-64]! // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 64
+; CHECK1024-SPLITSVE-NEXT: cntd x9
+; CHECK1024-SPLITSVE-NEXT: stp x28, x27, [sp, #32] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str x9, [sp, #16] // 8-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: stp x26, x19, [sp, #48] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: mov x29, sp
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa w29, 64
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w19, -8
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w26, -16
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w27, -24
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w28, -32
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset vg, -48
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w30, -56
+; CHECK1024-SPLITSVE-NEXT: .cfi_offset w29, -64
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-2
+; CHECK1024-SPLITSVE-NEXT: str p15, [sp, #4, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p14, [sp, #5, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p13, [sp, #6, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p12, [sp, #7, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p11, [sp, #8, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p10, [sp, #9, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p9, [sp, #10, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p8, [sp, #11, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p7, [sp, #12, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p6, [sp, #13, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p5, [sp, #14, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str p4, [sp, #15, mul vl] // 2-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #-16
+; CHECK1024-SPLITSVE-NEXT: str z23, [sp] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z22, [sp, #1, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z21, [sp, #2, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z20, [sp, #3, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z19, [sp, #4, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z18, [sp, #5, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z17, [sp, #6, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z16, [sp, #7, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z15, [sp, #8, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z14, [sp, #9, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z13, [sp, #10, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z12, [sp, #11, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z11, [sp, #12, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z10, [sp, #13, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z9, [sp, #14, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: str z8, [sp, #15, mul vl] // 16-byte Folded Spill
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x48, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x68, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d8 @ cfa - 24 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x49, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x60, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d9 @ cfa - 32 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4a, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x58, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d10 @ cfa - 40 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4b, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x50, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d11 @ cfa - 48 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4c, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x48, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d12 @ cfa - 56 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4d, 0x0d, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0x40, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d13 @ cfa - 64 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4e, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb8, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d14 @ cfa - 72 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: .cfi_escape 0x10, 0x4f, 0x0e, 0x12, 0x11, 0x50, 0x22, 0x06, 0x11, 0xb0, 0x7f, 0x1e, 0x22, 0x11, 0xc0, 0x77, 0x22 // $d15 @ cfa - 80 * IncomingVG - 1088
+; CHECK1024-SPLITSVE-NEXT: sub sp, sp, #1072
+; CHECK1024-SPLITSVE-NEXT: bl __arm_sme_state
+; CHECK1024-SPLITSVE-NEXT: mov x19, x0
+; CHECK1024-SPLITSVE-NEXT: //APP
+; CHECK1024-SPLITSVE-NEXT: //NO_APP
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_2
+; CHECK1024-SPLITSVE-NEXT: // %bb.1: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstop sm
+; CHECK1024-SPLITSVE-NEXT: .LBB29_2: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov x0, sp
+; CHECK1024-SPLITSVE-NEXT: mov w1, #45 // =0x2d
+; CHECK1024-SPLITSVE-NEXT: mov w2, #37 // =0x25
+; CHECK1024-SPLITSVE-NEXT: bl memset
+; CHECK1024-SPLITSVE-NEXT: tbz w19, #0, .LBB29_4
+; CHECK1024-SPLITSVE-NEXT: // %bb.3: // %entry
+; CHECK1024-SPLITSVE-NEXT: smstart sm
+; CHECK1024-SPLITSVE-NEXT: .LBB29_4: // %entry
+; CHECK1024-SPLITSVE-NEXT: mov w0, #22647 // =0x5877
+; CHECK1024-SPLITSVE-NEXT: movk w0, #59491, lsl #16
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1072
+; CHECK1024-SPLITSVE-NEXT: ldr z23, [sp] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z22, [sp, #1, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z21, [sp, #2, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z20, [sp, #3, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z19, [sp, #4, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z18, [sp, #5, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z17, [sp, #6, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z16, [sp, #7, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z15, [sp, #8, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z14, [sp, #9, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z13, [sp, #10, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z12, [sp, #11, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z11, [sp, #12, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z10, [sp, #13, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z9, [sp, #14, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr z8, [sp, #15, mul vl] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: add sp, sp, #1024
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #16
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z8
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z9
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z10
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z11
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z12
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z13
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z14
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore z15
+; CHECK1024-SPLITSVE-NEXT: ldr p15, [sp, #4, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p14, [sp, #5, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p13, [sp, #6, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p12, [sp, #7, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p11, [sp, #8, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p10, [sp, #9, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p9, [sp, #10, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p8, [sp, #11, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p7, [sp, #12, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p6, [sp, #13, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p5, [sp, #14, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldr p4, [sp, #15, mul vl] // 2-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: addvl sp, sp, #2
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa wsp, 64
+; CHECK1024-SPLITSVE-NEXT: ldp x26, x19, [sp, #48] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x28, x27, [sp, #32] // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: ldp x29, x30, [sp], #64 // 16-byte Folded Reload
+; CHECK1024-SPLITSVE-NEXT: .cfi_def_cfa_offset 0
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w19
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w26
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w27
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w28
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore vg
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w30
+; CHECK1024-SPLITSVE-NEXT: .cfi_restore w29
+; CHECK1024-SPLITSVE-NEXT: ret
entry:
tail call void asm sideeffect "", "~{x0},~{x28},~{x27},~{x3}"() #2
diff --git a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
index 7bddd1d..cc63c7f 100644
--- a/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
+++ b/llvm/test/CodeGen/AArch64/sve-calling-convention-byref.ll
@@ -56,9 +56,9 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_many_svepred_arg(<
; CHECK: name: caller_with_many_svepred_arg
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector
+; CHECK-NEXT: stack-id: scalable-predicate-vector
; CHECK: - { id: 1, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector
+; CHECK-NEXT: stack-id: scalable-predicate-vector
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.0, 0
; CHECK-DAG: STR_PXI %{{[0-9]+}}, %stack.1, 0
; CHECK-DAG: [[BASE1:%[0-9]+]]:gpr64sp = ADDXri %stack.0, 0
@@ -90,7 +90,7 @@ define aarch64_sve_vector_pcs <vscale x 16 x i1> @caller_with_svepred_arg_1xv16i
; CHECK: name: caller_with_svepred_arg_1xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 2, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED0:%[0-9]+]]:ppr = COPY $p0
; CHECK: ADJCALLSTACKDOWN 0, 0, implicit-def dead $sp, implicit $sp
; CHECK: STR_PXI [[PRED0]], %stack.0, 0 :: (store (<vscale x 1 x s16>) into %stack.0)
@@ -139,7 +139,7 @@ define [4 x <vscale x 16 x i1>] @caller_with_svepred_arg_4xv16i1_4xv16i1([4 x <v
; CHECK: name: caller_with_svepred_arg_4xv16i1_4xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
@@ -200,7 +200,7 @@ define [2 x <vscale x 32 x i1>] @caller_with_svepred_arg_2xv32i1_1xv16i1([2 x <v
; CHECK: name: caller_with_svepred_arg_2xv32i1_1xv16i1
; CHECK: stack:
; CHECK: - { id: 0, name: '', type: default, offset: 0, size: 8, alignment: 2,
-; CHECK-NEXT: stack-id: scalable-vector,
+; CHECK-NEXT: stack-id: scalable-predicate-vector,
; CHECK: [[PRED3:%[0-9]+]]:ppr = COPY $p3
; CHECK: [[PRED2:%[0-9]+]]:ppr = COPY $p2
; CHECK: [[PRED1:%[0-9]+]]:ppr = COPY $p1
diff --git a/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
new file mode 100644
index 0000000..584753b
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/sve-load-store-legalisation.ll
@@ -0,0 +1,2854 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc -mattr=+sve < %s | FileCheck %s
+
+target triple = "aarch64-unknown-linux-gnu"
+
+define void @sve_load_store_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ store <vscale x 1 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ store <vscale x 2 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ store <vscale x 3 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ store <vscale x 4 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ store <vscale x 5 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ store <vscale x 6 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ store <vscale x 7 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ store <vscale x 8 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ store <vscale x 9 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ store <vscale x 10 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ store <vscale x 11 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ store <vscale x 12 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ store <vscale x 13 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: ld1b { z1.h }, p2/z, [x0]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p2, [x1]
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ store <vscale x 14 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ store <vscale x 15 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ store <vscale x 16 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv17i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv17i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #17 // =0x11
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 17 x i8>, ptr %a
+ store <vscale x 17 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv18i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv18i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z2.s, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z2.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpkhi z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z0.s, z2.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: uzp1 z2.s, z2.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z1.h, z2.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1b { z0.d }, p0, [x1, x8]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 18 x i8>, ptr %a
+ store <vscale x 18 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv19i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv19i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #19 // =0x13
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 19 x i8>, ptr %a
+ store <vscale x 19 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv20i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv20i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 20 x i8>, ptr %a
+ store <vscale x 20 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv21i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv21i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #21 // =0x15
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 21 x i8>, ptr %a
+ store <vscale x 21 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv22i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv22i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: cntw x8, all, mul #5
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: ld1b { z1.d }, p1/z, [x0, x8]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uzp1 z1.b, z0.b, z1.b
+; CHECK-NEXT: uunpkhi z1.h, z1.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uzp1 z1.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z0.b, z1.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p1, [x1, x8]
+; CHECK-NEXT: st1b { z0.s }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 22 x i8>, ptr %a
+ store <vscale x 22 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv23i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv23i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #23 // =0x17
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 23 x i8>, ptr %a
+ store <vscale x 23 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv24i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv24i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: st1b { z0.h }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 24 x i8>, ptr %a
+ store <vscale x 24 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv25i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv25i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #25 // =0x19
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 25 x i8>, ptr %a
+ store <vscale x 25 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv26i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv26i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cnth x8, all, mul #3
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1b { z1.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 26 x i8>, ptr %a
+ store <vscale x 26 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv27i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv27i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #27 // =0x1b
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 27 x i8>, ptr %a
+ store <vscale x 27 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv28i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv28i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.h
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z1.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: st1b { z0.h }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 28 x i8>, ptr %a
+ store <vscale x 28 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv29i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv29i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #29 // =0x1d
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 29 x i8>, ptr %a
+ store <vscale x 29 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv30i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv30i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: cntw x8, all, mul #7
+; CHECK-NEXT: ldr z3, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0, x8]
+; CHECK-NEXT: ptrue p2.h
+; CHECK-NEXT: ld1b { z1.s }, p1/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1b { z2.h }, p2/z, [x0, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uzp1 z0.b, z2.b, z0.b
+; CHECK-NEXT: uunpkhi z1.h, z0.b
+; CHECK-NEXT: uunpklo z0.h, z0.b
+; CHECK-NEXT: uunpkhi z2.s, z1.h
+; CHECK-NEXT: uunpklo z1.s, z1.h
+; CHECK-NEXT: uunpklo z2.d, z2.s
+; CHECK-NEXT: st1b { z2.d }, p0, [x1, x8]
+; CHECK-NEXT: st1b { z0.h }, p2, [x1, #2, mul vl]
+; CHECK-NEXT: st1b { z1.s }, p1, [x1, #6, mul vl]
+; CHECK-NEXT: str z3, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 30 x i8>, ptr %a
+ store <vscale x 30 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv31i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv31i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w10, #31 // =0x1f
+; CHECK-NEXT: lsr x9, x8, #4
+; CHECK-NEXT: mul x9, x9, x10
+; CHECK-NEXT: whilelo p0.b, x8, x9
+; CHECK-NEXT: whilelo p1.b, xzr, x9
+; CHECK-NEXT: ld1b { z0.b }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.b }, p1/z, [x0]
+; CHECK-NEXT: st1b { z0.b }, p0, [x1, #1, mul vl]
+; CHECK-NEXT: st1b { z1.b }, p1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 31 x i8>, ptr %a
+ store <vscale x 31 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv32i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv32i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 32 x i8>, ptr %a
+ store <vscale x 32 x i8> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ store <vscale x 1 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ store <vscale x 2 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ store <vscale x 3 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ store <vscale x 4 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ store <vscale x 5 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ store <vscale x 6 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ store <vscale x 7 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ store <vscale x 8 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i16>, ptr %a
+ store <vscale x 9 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpkhi z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z0.s, z1.s
+; CHECK-NEXT: uzp1 z1.h, z0.h, z1.h
+; CHECK-NEXT: uunpkhi z1.s, z1.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: uzp1 z1.s, z1.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z0.h, z1.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i16>, ptr %a
+ store <vscale x 10 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i16>, ptr %a
+ store <vscale x 11 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: st1h { z0.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i16>, ptr %a
+ store <vscale x 12 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i16>, ptr %a
+ store <vscale x 13 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i16>, ptr %a
+ store <vscale x 14 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i16>, ptr %a
+ store <vscale x 15 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i16>, ptr %a
+ store <vscale x 16 x i16> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ store <vscale x 1 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ store <vscale x 2 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ store <vscale x 3 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ store <vscale x 4 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i32>, ptr %a
+ store <vscale x 5 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: uzp1 z0.s, z1.s, z0.s
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1w { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i32>, ptr %a
+ store <vscale x 6 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i32>, ptr %a
+ store <vscale x 7 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i32>, ptr %a
+ store <vscale x 8 x i32> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i64>, ptr %a
+ store <vscale x 1 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i64>, ptr %a
+ store <vscale x 2 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i64>, ptr %a
+ store <vscale x 3 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4i64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4i64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i64>, ptr %a
+ store <vscale x 4 x i64> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x half>, ptr %a
+ store <vscale x 1 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x half>, ptr %a
+ store <vscale x 2 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x half>, ptr %a
+ store <vscale x 3 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x half>, ptr %a
+ store <vscale x 4 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x half>, ptr %a
+ store <vscale x 5 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x half>, ptr %a
+ store <vscale x 6 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x half>, ptr %a
+ store <vscale x 7 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x half>, ptr %a
+ store <vscale x 8 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x half>, ptr %a
+ store <vscale x 9 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x half>, ptr %a
+ store <vscale x 10 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x half>, ptr %a
+ store <vscale x 11 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x half>, ptr %a
+ store <vscale x 12 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x half>, ptr %a
+ store <vscale x 13 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x half>, ptr %a
+ store <vscale x 14 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x half>, ptr %a
+ store <vscale x 15 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16f16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16f16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x half>, ptr %a
+ store <vscale x 16 x half> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x float>, ptr %a
+ store <vscale x 1 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x float>, ptr %a
+ store <vscale x 2 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1w { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x float>, ptr %a
+ store <vscale x 3 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x float>, ptr %a
+ store <vscale x 4 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x float>, ptr %a
+ store <vscale x 5 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1w { z1.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x float>, ptr %a
+ store <vscale x 6 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x float>, ptr %a
+ store <vscale x 7 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8f32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8f32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x float>, ptr %a
+ store <vscale x 8 x float> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1d { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x double>, ptr %a
+ store <vscale x 1 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x double>, ptr %a
+ store <vscale x 2 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1d { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1d { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x double>, ptr %a
+ store <vscale x 3 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4f64(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4f64:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x double>, ptr %a
+ store <vscale x 4 x double> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv1bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv1bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x bfloat>, ptr %a
+ store <vscale x 1 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv2bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv2bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.d }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x bfloat>, ptr %a
+ store <vscale x 2 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv3bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv3bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x bfloat>, ptr %a
+ store <vscale x 3 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv4bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv4bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.s }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x bfloat>, ptr %a
+ store <vscale x 4 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv5bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv5bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x bfloat>, ptr %a
+ store <vscale x 5 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv6bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv6bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x bfloat>, ptr %a
+ store <vscale x 6 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv7bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv7bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1h { z0.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x bfloat>, ptr %a
+ store <vscale x 7 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv8bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv8bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x bfloat>, ptr %a
+ store <vscale x 8 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv9bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv9bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x bfloat>, ptr %a
+ store <vscale x 9 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv10bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv10bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.d }, p0/z, [x0, #4, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.d }, p0, [x1, #4, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x bfloat>, ptr %a
+ store <vscale x 10 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv11bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv11bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x bfloat>, ptr %a
+ store <vscale x 11 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv12bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv12bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ldr z0, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z0, [x1]
+; CHECK-NEXT: st1h { z1.s }, p0, [x1, #2, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x bfloat>, ptr %a
+ store <vscale x 12 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv13bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv13bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x bfloat>, ptr %a
+ store <vscale x 13 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv14bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv14bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ldr z2, [x0]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0, #6, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p1/z, [x0, #2, mul vl]
+; CHECK-NEXT: str z2, [x1]
+; CHECK-NEXT: uzp1 z0.s, z0.s, z0.s
+; CHECK-NEXT: st1h { z1.s }, p1, [x1, #2, mul vl]
+; CHECK-NEXT: uzp1 z0.h, z1.h, z0.h
+; CHECK-NEXT: uunpkhi z0.s, z0.h
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [x1, #6, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x bfloat>, ptr %a
+ store <vscale x 14 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv15bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv15bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [x1, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x bfloat>, ptr %a
+ store <vscale x 15 x bfloat> %c, ptr %b
+ ret void
+}
+
+define void @sve_load_store_nxv16bf16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_load_store_nxv16bf16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ldr z0, [x0, #1, mul vl]
+; CHECK-NEXT: ldr z1, [x0]
+; CHECK-NEXT: str z0, [x1, #1, mul vl]
+; CHECK-NEXT: str z1, [x1]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x bfloat>, ptr %a
+ store <vscale x 16 x bfloat> %c, ptr %b
+ ret void
+}
+
+define <vscale x 1 x i16> @sve_sextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.sext = sext <vscale x 1 x i8> %c to <vscale x 1 x i16>
+ ret <vscale x 1 x i16> %c.sext
+}
+
+define <vscale x 2 x i16> @sve_sextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sb { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.sext = sext <vscale x 2 x i8> %c to <vscale x 2 x i16>
+ ret <vscale x 2 x i16> %c.sext
+}
+
+define <vscale x 3 x i16> @sve_sextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.sext = sext <vscale x 3 x i8> %c to <vscale x 3 x i16>
+ ret <vscale x 3 x i16> %c.sext
+}
+
+define <vscale x 4 x i16> @sve_sextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sb { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.sext = sext <vscale x 4 x i8> %c to <vscale x 4 x i16>
+ ret <vscale x 4 x i16> %c.sext
+}
+
+define <vscale x 5 x i16> @sve_sextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.sext = sext <vscale x 5 x i8> %c to <vscale x 5 x i16>
+ ret <vscale x 5 x i16> %c.sext
+}
+
+define <vscale x 6 x i16> @sve_sextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.sext = sext <vscale x 6 x i8> %c to <vscale x 6 x i16>
+ ret <vscale x 6 x i16> %c.sext
+}
+
+define <vscale x 7 x i16> @sve_sextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.sext = sext <vscale x 7 x i8> %c to <vscale x 7 x i16>
+ ret <vscale x 7 x i16> %c.sext
+}
+
+define <vscale x 8 x i16> @sve_sextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.sext = sext <vscale x 8 x i8> %c to <vscale x 8 x i16>
+ ret <vscale x 8 x i16> %c.sext
+}
+
+define <vscale x 9 x i16> @sve_sextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.sext = sext <vscale x 9 x i8> %c to <vscale x 9 x i16>
+ ret <vscale x 9 x i16> %c.sext
+}
+
+define <vscale x 10 x i16> @sve_sextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.sext = sext <vscale x 10 x i8> %c to <vscale x 10 x i16>
+ ret <vscale x 10 x i16> %c.sext
+}
+
+define <vscale x 11 x i16> @sve_sextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.sext = sext <vscale x 11 x i8> %c to <vscale x 11 x i16>
+ ret <vscale x 11 x i16> %c.sext
+}
+
+define <vscale x 12 x i16> @sve_sextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.sext = sext <vscale x 12 x i8> %c to <vscale x 12 x i16>
+ ret <vscale x 12 x i16> %c.sext
+}
+
+define <vscale x 13 x i16> @sve_sextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.sext = sext <vscale x 13 x i8> %c to <vscale x 13 x i16> ; sign-extend i8 -> i16; the assertions expect it folded into the ld1sb loads
+ ret <vscale x 13 x i16> %c.sext
+}
+
+define <vscale x 14 x i16> @sve_sextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1sb { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.sext = sext <vscale x 14 x i8> %c to <vscale x 14 x i16> ; sign-extend i8 -> i16; the assertions expect it folded into the ld1sb loads
+ ret <vscale x 14 x i16> %c.sext
+}
+
+define <vscale x 15 x i16> @sve_sextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sb { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.sext = sext <vscale x 15 x i8> %c to <vscale x 15 x i16> ; sign-extend i8 -> i16; the assertions expect it folded into the ld1sb loads
+ ret <vscale x 15 x i16> %c.sext
+}
+
+define <vscale x 16 x i16> @sve_sextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1sb { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1sb { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.sext = sext <vscale x 16 x i8> %c to <vscale x 16 x i16> ; sign-extend i8 -> i16; the assertions expect two ld1sb loads, one per result register
+ ret <vscale x 16 x i16> %c.sext
+}
+
+define <vscale x 1 x i32> @sve_sextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.sext = sext <vscale x 1 x i16> %c to <vscale x 1 x i32> ; sign-extend i16 -> i32; the assertions expect it folded into a predicated ld1sh
+ ret <vscale x 1 x i32> %c.sext
+}
+
+define <vscale x 2 x i32> @sve_sextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sh { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.sext = sext <vscale x 2 x i16> %c to <vscale x 2 x i32> ; sign-extend i16 -> i32; the assertions expect it folded into a single ld1sh
+ ret <vscale x 2 x i32> %c.sext
+}
+
+define <vscale x 3 x i32> @sve_sextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.sext = sext <vscale x 3 x i16> %c to <vscale x 3 x i32> ; sign-extend i16 -> i32; the assertions expect it folded into a predicated ld1sh
+ ret <vscale x 3 x i32> %c.sext
+}
+
+define <vscale x 4 x i32> @sve_sextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.sext = sext <vscale x 4 x i16> %c to <vscale x 4 x i32> ; sign-extend i16 -> i32; the assertions expect it folded into a single ld1sh
+ ret <vscale x 4 x i32> %c.sext
+}
+
+define <vscale x 5 x i32> @sve_sextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.sext = sext <vscale x 5 x i16> %c to <vscale x 5 x i32> ; sign-extend i16 -> i32; the assertions expect it folded into the ld1sh loads
+ ret <vscale x 5 x i32> %c.sext
+}
+
+define <vscale x 6 x i32> @sve_sextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.sext = sext <vscale x 6 x i16> %c to <vscale x 6 x i32> ; sign-extend i16 -> i32; the assertions expect it folded into the ld1sh loads
+ ret <vscale x 6 x i32> %c.sext
+}
+
+define <vscale x 7 x i32> @sve_sextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sh { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.sext = sext <vscale x 7 x i16> %c to <vscale x 7 x i32> ; sign-extend i16 -> i32; the assertions expect it folded into the ld1sh loads
+ ret <vscale x 7 x i32> %c.sext
+}
+
+define <vscale x 8 x i32> @sve_sextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1sh { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1sh { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.sext = sext <vscale x 8 x i16> %c to <vscale x 8 x i32> ; sign-extend i16 -> i32; the assertions expect two ld1sh loads, one per result register
+ ret <vscale x 8 x i32> %c.sext
+}
+
+define <vscale x 1 x i64> @sve_sextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.sext = sext <vscale x 1 x i32> %c to <vscale x 1 x i64> ; sign-extend i32 -> i64; the assertions expect it folded into a predicated ld1sw
+ ret <vscale x 1 x i64> %c.sext
+}
+
+define <vscale x 2 x i64> @sve_sextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.sext = sext <vscale x 2 x i32> %c to <vscale x 2 x i64> ; sign-extend i32 -> i64; the assertions expect it folded into a single ld1sw
+ ret <vscale x 2 x i64> %c.sext
+}
+
+define <vscale x 3 x i64> @sve_sextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1sw { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.sext = sext <vscale x 3 x i32> %c to <vscale x 3 x i64> ; sign-extend i32 -> i64; the assertions expect it folded into the ld1sw loads
+ ret <vscale x 3 x i64> %c.sext
+}
+
+define <vscale x 4 x i64> @sve_sextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_sextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1sw { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1sw { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.sext = sext <vscale x 4 x i32> %c to <vscale x 4 x i64> ; sign-extend i32 -> i64; the assertions expect two ld1sw loads, one per result register
+ ret <vscale x 4 x i64> %c.sext
+}
+
+define <vscale x 1 x i16> @sve_zextload_nxv1i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i8>, ptr %a
+ %c.zext = zext <vscale x 1 x i8> %c to <vscale x 1 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 1 x i16> %c.zext
+}
+
+define <vscale x 2 x i16> @sve_zextload_nxv2i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1b { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i8>, ptr %a
+ %c.zext = zext <vscale x 2 x i8> %c to <vscale x 2 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 2 x i16> %c.zext
+}
+
+define <vscale x 3 x i16> @sve_zextload_nxv3i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i8>, ptr %a
+ %c.zext = zext <vscale x 3 x i8> %c to <vscale x 3 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 3 x i16> %c.zext
+}
+
+define <vscale x 4 x i16> @sve_zextload_nxv4i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1b { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i8>, ptr %a
+ %c.zext = zext <vscale x 4 x i8> %c to <vscale x 4 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 4 x i16> %c.zext
+}
+
+define <vscale x 5 x i16> @sve_zextload_nxv5i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i8>, ptr %a
+ %c.zext = zext <vscale x 5 x i8> %c to <vscale x 5 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 5 x i16> %c.zext
+}
+
+define <vscale x 6 x i16> @sve_zextload_nxv6i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i8>, ptr %a
+ %c.zext = zext <vscale x 6 x i8> %c to <vscale x 6 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 6 x i16> %c.zext
+}
+
+define <vscale x 7 x i16> @sve_zextload_nxv7i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i8>, ptr %a
+ %c.zext = zext <vscale x 7 x i8> %c to <vscale x 7 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 7 x i16> %c.zext
+}
+
+define <vscale x 8 x i16> @sve_zextload_nxv8i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i8>, ptr %a
+ %c.zext = zext <vscale x 8 x i8> %c to <vscale x 8 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 8 x i16> %c.zext
+}
+
+define <vscale x 9 x i16> @sve_zextload_nxv9i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv9i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #9 // =0x9
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 9 x i8>, ptr %a
+ %c.zext = zext <vscale x 9 x i8> %c to <vscale x 9 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 9 x i16> %c.zext
+}
+
+define <vscale x 10 x i16> @sve_zextload_nxv10i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv10i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #5
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: st1h { z0.d }, p0, [sp, #4, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 10 x i8>, ptr %a
+ %c.zext = zext <vscale x 10 x i8> %c to <vscale x 10 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 10 x i16> %c.zext
+}
+
+define <vscale x 11 x i16> @sve_zextload_nxv11i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv11i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #11 // =0xb
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 11 x i8>, ptr %a
+ %c.zext = zext <vscale x 11 x i8> %c to <vscale x 11 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 11 x i16> %c.zext
+}
+
+define <vscale x 12 x i16> @sve_zextload_nxv12i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv12i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntw x8, all, mul #3
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 12 x i8>, ptr %a
+ %c.zext = zext <vscale x 12 x i8> %c to <vscale x 12 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 12 x i16> %c.zext
+}
+
+define <vscale x 13 x i16> @sve_zextload_nxv13i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv13i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #13 // =0xd
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 13 x i8>, ptr %a
+ %c.zext = zext <vscale x 13 x i8> %c to <vscale x 13 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 13 x i16> %c.zext
+}
+
+define <vscale x 14 x i16> @sve_zextload_nxv14i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv14i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #7
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.s
+; CHECK-NEXT: ld1b { z2.h }, p0/z, [x0]
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: uunpkhi z1.s, z0.h
+; CHECK-NEXT: uunpklo z0.s, z0.h
+; CHECK-NEXT: str z2, [sp]
+; CHECK-NEXT: uunpklo z1.d, z1.s
+; CHECK-NEXT: st1h { z0.s }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: st1h { z1.d }, p0, [sp, #6, mul vl]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 14 x i8>, ptr %a
+ %c.zext = zext <vscale x 14 x i8> %c to <vscale x 14 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 14 x i16> %c.zext
+}
+
+define <vscale x 15 x i16> @sve_zextload_nxv15i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv15i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #15 // =0xf
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.b, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1b { z0.h }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0]
+; CHECK-NEXT: st1h { z0.h }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1h { z1.h }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 15 x i8>, ptr %a
+ %c.zext = zext <vscale x 15 x i8> %c to <vscale x 15 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 15 x i16> %c.zext
+}
+
+define <vscale x 16 x i16> @sve_zextload_nxv16i8(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv16i8:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.h
+; CHECK-NEXT: ld1b { z0.h }, p0/z, [x0]
+; CHECK-NEXT: ld1b { z1.h }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 16 x i8>, ptr %a
+ %c.zext = zext <vscale x 16 x i8> %c to <vscale x 16 x i16> ; zero-extend i8 -> i16 so the unsigned ld1b form is what gets tested
+ ret <vscale x 16 x i16> %c.zext
+}
+
+define <vscale x 1 x i32> @sve_zextload_nxv1i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i16>, ptr %a
+ %c.zext = zext <vscale x 1 x i16> %c to <vscale x 1 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 1 x i32> %c.zext
+}
+
+define <vscale x 2 x i32> @sve_zextload_nxv2i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1h { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i16>, ptr %a
+ %c.zext = zext <vscale x 2 x i16> %c to <vscale x 2 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 2 x i32> %c.zext
+}
+
+define <vscale x 3 x i32> @sve_zextload_nxv3i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i16>, ptr %a
+ %c.zext = zext <vscale x 3 x i16> %c to <vscale x 3 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 3 x i32> %c.zext
+}
+
+define <vscale x 4 x i32> @sve_zextload_nxv4i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i16>, ptr %a
+ %c.zext = zext <vscale x 4 x i16> %c to <vscale x 4 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 4 x i32> %c.zext
+}
+
+define <vscale x 5 x i32> @sve_zextload_nxv5i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv5i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #5 // =0x5
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 5 x i16>, ptr %a
+ %c.zext = zext <vscale x 5 x i16> %c to <vscale x 5 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 5 x i32> %c.zext
+}
+
+define <vscale x 6 x i32> @sve_zextload_nxv6i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv6i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: cntd x8, all, mul #3
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ptrue p1.d
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
+; CHECK-NEXT: uunpklo z0.d, z0.s
+; CHECK-NEXT: str z1, [sp]
+; CHECK-NEXT: st1w { z0.d }, p1, [sp, #2, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 6 x i16>, ptr %a
+ %c.zext = zext <vscale x 6 x i16> %c to <vscale x 6 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 6 x i32> %c.zext
+}
+
+define <vscale x 7 x i32> @sve_zextload_nxv7i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv7i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #7 // =0x7
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.h, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1h { z0.s }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0]
+; CHECK-NEXT: st1w { z0.s }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1w { z1.s }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 7 x i16>, ptr %a
+ %c.zext = zext <vscale x 7 x i16> %c to <vscale x 7 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 7 x i32> %c.zext
+}
+
+define <vscale x 8 x i32> @sve_zextload_nxv8i16(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv8i16:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.s
+; CHECK-NEXT: ld1h { z0.s }, p0/z, [x0]
+; CHECK-NEXT: ld1h { z1.s }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 8 x i16>, ptr %a
+ %c.zext = zext <vscale x 8 x i16> %c to <vscale x 8 x i32> ; zero-extend i16 -> i32 so the unsigned ld1h form is what gets tested
+ ret <vscale x 8 x i32> %c.zext
+}
+
+define <vscale x 1 x i64> @sve_zextload_nxv1i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv1i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: whilelo p0.d, xzr, x8
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 1 x i32>, ptr %a
+ %c.zext = zext <vscale x 1 x i32> %c to <vscale x 1 x i64> ; zero-extend i32 -> i64 so the unsigned ld1w form is what gets tested
+ ret <vscale x 1 x i64> %c.zext
+}
+
+define <vscale x 2 x i64> @sve_zextload_nxv2i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv2i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ret
+ %c = load <vscale x 2 x i32>, ptr %a
+ %c.zext = zext <vscale x 2 x i32> %c to <vscale x 2 x i64> ; zero-extend i32 -> i64 so the unsigned ld1w form is what gets tested
+ ret <vscale x 2 x i64> %c.zext
+}
+
+define <vscale x 3 x i64> @sve_zextload_nxv3i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv3i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: str x29, [sp, #-16]! // 8-byte Folded Spill
+; CHECK-NEXT: addvl sp, sp, #-2
+; CHECK-NEXT: .cfi_escape 0x0f, 0x08, 0x8f, 0x10, 0x92, 0x2e, 0x00, 0x40, 0x1e, 0x22 // sp + 16 + 16 * VG
+; CHECK-NEXT: .cfi_offset w29, -16
+; CHECK-NEXT: rdvl x8, #1
+; CHECK-NEXT: mov w9, #3 // =0x3
+; CHECK-NEXT: lsr x8, x8, #4
+; CHECK-NEXT: mul x8, x8, x9
+; CHECK-NEXT: whilelo p0.s, xzr, x8
+; CHECK-NEXT: punpkhi p1.h, p0.b
+; CHECK-NEXT: punpklo p0.h, p0.b
+; CHECK-NEXT: ld1w { z0.d }, p1/z, [x0, #1, mul vl]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0]
+; CHECK-NEXT: st1d { z0.d }, p1, [sp, #1, mul vl]
+; CHECK-NEXT: st1d { z1.d }, p0, [sp]
+; CHECK-NEXT: ldr z1, [sp, #1, mul vl]
+; CHECK-NEXT: ldr z0, [sp]
+; CHECK-NEXT: addvl sp, sp, #2
+; CHECK-NEXT: ldr x29, [sp], #16 // 8-byte Folded Reload
+; CHECK-NEXT: ret
+ %c = load <vscale x 3 x i32>, ptr %a
+ %c.zext = zext <vscale x 3 x i32> %c to <vscale x 3 x i64> ; zero-extend i32 -> i64 so the unsigned ld1w form is what gets tested
+ ret <vscale x 3 x i64> %c.zext
+}
+
+define <vscale x 4 x i64> @sve_zextload_nxv4i32(ptr %a, ptr %b) {
+; CHECK-LABEL: sve_zextload_nxv4i32:
+; CHECK: // %bb.0:
+; CHECK-NEXT: ptrue p0.d
+; CHECK-NEXT: ld1w { z0.d }, p0/z, [x0]
+; CHECK-NEXT: ld1w { z1.d }, p0/z, [x0, #1, mul vl]
+; CHECK-NEXT: ret
+ %c = load <vscale x 4 x i32>, ptr %a
+ %c.zext = zext <vscale x 4 x i32> %c to <vscale x 4 x i64> ; zero-extend i32 -> i64 so the unsigned ld1w form is what gets tested
+ ret <vscale x 4 x i64> %c.zext
+}
diff --git a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
index 2cbb29e..d8de12c 100644
--- a/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
+++ b/llvm/test/CodeGen/AArch64/sve-stack-frame-layout.ll
@@ -672,5 +672,3 @@ entry:
ret i32 %x
}
declare void @other()
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK-FRAMELAYOUT: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
index 9b4539c..10d61de 100644
--- a/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
+++ b/llvm/test/CodeGen/AMDGPU/identical-subrange-spill-infloop.ll
@@ -6,96 +6,134 @@ define void @main(i1 %arg) #0 {
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 ; 4-byte Folded Spill
-; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 ; 4-byte Folded Spill
+; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
-; CHECK-NEXT: v_writelane_b32 v5, s30, 0
-; CHECK-NEXT: v_writelane_b32 v5, s31, 1
-; CHECK-NEXT: v_writelane_b32 v5, s36, 2
-; CHECK-NEXT: v_writelane_b32 v5, s37, 3
-; CHECK-NEXT: v_writelane_b32 v5, s38, 4
-; CHECK-NEXT: v_writelane_b32 v5, s39, 5
-; CHECK-NEXT: v_writelane_b32 v5, s48, 6
-; CHECK-NEXT: v_writelane_b32 v5, s49, 7
-; CHECK-NEXT: v_writelane_b32 v5, s50, 8
-; CHECK-NEXT: v_writelane_b32 v5, s51, 9
-; CHECK-NEXT: v_writelane_b32 v5, s52, 10
-; CHECK-NEXT: v_writelane_b32 v5, s53, 11
-; CHECK-NEXT: v_writelane_b32 v5, s54, 12
-; CHECK-NEXT: v_writelane_b32 v5, s55, 13
-; CHECK-NEXT: s_getpc_b64 s[24:25]
-; CHECK-NEXT: v_writelane_b32 v5, s64, 14
-; CHECK-NEXT: s_movk_i32 s4, 0xf0
-; CHECK-NEXT: s_mov_b32 s5, s24
-; CHECK-NEXT: v_writelane_b32 v5, s65, 15
-; CHECK-NEXT: s_load_dwordx16 s[8:23], s[4:5], 0x0
-; CHECK-NEXT: s_mov_b64 s[4:5], 0
-; CHECK-NEXT: v_writelane_b32 v5, s66, 16
-; CHECK-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x0
-; CHECK-NEXT: v_writelane_b32 v5, s67, 17
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_movk_i32 s6, 0x130
-; CHECK-NEXT: s_mov_b32 s7, s24
-; CHECK-NEXT: v_writelane_b32 v5, s68, 18
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[6:7], 0x0
-; CHECK-NEXT: v_writelane_b32 v5, s69, 19
-; CHECK-NEXT: v_writelane_b32 v5, s70, 20
+; CHECK-NEXT: v_writelane_b32 v6, s30, 0
+; CHECK-NEXT: v_writelane_b32 v6, s31, 1
+; CHECK-NEXT: v_writelane_b32 v6, s36, 2
+; CHECK-NEXT: v_writelane_b32 v6, s37, 3
+; CHECK-NEXT: v_writelane_b32 v6, s38, 4
+; CHECK-NEXT: v_writelane_b32 v6, s39, 5
+; CHECK-NEXT: v_writelane_b32 v6, s48, 6
+; CHECK-NEXT: v_writelane_b32 v6, s49, 7
+; CHECK-NEXT: v_writelane_b32 v6, s50, 8
+; CHECK-NEXT: v_writelane_b32 v6, s51, 9
+; CHECK-NEXT: v_writelane_b32 v6, s52, 10
+; CHECK-NEXT: v_writelane_b32 v6, s53, 11
+; CHECK-NEXT: v_writelane_b32 v6, s54, 12
+; CHECK-NEXT: v_writelane_b32 v6, s55, 13
+; CHECK-NEXT: v_writelane_b32 v6, s64, 14
+; CHECK-NEXT: v_writelane_b32 v6, s65, 15
+; CHECK-NEXT: v_writelane_b32 v6, s66, 16
+; CHECK-NEXT: v_writelane_b32 v6, s67, 17
+; CHECK-NEXT: v_writelane_b32 v6, s68, 18
+; CHECK-NEXT: s_getpc_b64 s[4:5]
+; CHECK-NEXT: s_mov_b64 s[8:9], 0
+; CHECK-NEXT: v_writelane_b32 v6, s69, 19
; CHECK-NEXT: s_mov_b32 s68, 0
-; CHECK-NEXT: v_writelane_b32 v5, s71, 21
+; CHECK-NEXT: s_mov_b32 s69, s4
+; CHECK-NEXT: s_load_dwordx4 s[4:7], s[8:9], 0x0
+; CHECK-NEXT: s_load_dwordx8 s[24:31], s[68:69], 0x30
+; CHECK-NEXT: s_load_dwordx16 s[52:67], s[68:69], 0xf0
+; CHECK-NEXT: ; kill: killed $sgpr8_sgpr9
+; CHECK-NEXT: s_nop 0
+; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x130
+; CHECK-NEXT: ; implicit-def: $vgpr7 : SGPR spill to VGPR lane
+; CHECK-NEXT: v_writelane_b32 v6, s70, 20
+; CHECK-NEXT: v_writelane_b32 v6, s71, 21
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
; CHECK-NEXT: v_mov_b32_e32 v2, 0
+; CHECK-NEXT: v_writelane_b32 v7, s8, 0
+; CHECK-NEXT: v_writelane_b32 v7, s9, 1
+; CHECK-NEXT: v_writelane_b32 v7, s10, 2
+; CHECK-NEXT: v_writelane_b32 v7, s11, 3
+; CHECK-NEXT: v_writelane_b32 v7, s12, 4
+; CHECK-NEXT: v_writelane_b32 v7, s13, 5
+; CHECK-NEXT: v_writelane_b32 v7, s14, 6
+; CHECK-NEXT: v_writelane_b32 v7, s15, 7
+; CHECK-NEXT: v_writelane_b32 v7, s16, 8
+; CHECK-NEXT: v_writelane_b32 v7, s17, 9
+; CHECK-NEXT: v_writelane_b32 v7, s18, 10
+; CHECK-NEXT: v_writelane_b32 v7, s19, 11
+; CHECK-NEXT: v_writelane_b32 v7, s20, 12
+; CHECK-NEXT: v_writelane_b32 v7, s21, 13
+; CHECK-NEXT: v_writelane_b32 v7, s22, 14
+; CHECK-NEXT: v_writelane_b32 v7, s23, 15
+; CHECK-NEXT: v_writelane_b32 v7, s52, 16
+; CHECK-NEXT: v_writelane_b32 v7, s53, 17
+; CHECK-NEXT: v_writelane_b32 v7, s54, 18
+; CHECK-NEXT: v_writelane_b32 v7, s55, 19
+; CHECK-NEXT: v_writelane_b32 v7, s56, 20
+; CHECK-NEXT: v_writelane_b32 v7, s57, 21
+; CHECK-NEXT: v_writelane_b32 v7, s58, 22
+; CHECK-NEXT: v_writelane_b32 v7, s59, 23
+; CHECK-NEXT: v_writelane_b32 v7, s60, 24
+; CHECK-NEXT: v_writelane_b32 v7, s61, 25
+; CHECK-NEXT: v_writelane_b32 v7, s62, 26
+; CHECK-NEXT: v_writelane_b32 v7, s63, 27
+; CHECK-NEXT: v_writelane_b32 v7, s64, 28
+; CHECK-NEXT: v_writelane_b32 v7, s65, 29
+; CHECK-NEXT: v_writelane_b32 v7, s66, 30
+; CHECK-NEXT: s_load_dwordx16 s[8:23], s[68:69], 0x1f0
+; CHECK-NEXT: s_load_dwordx16 s[36:51], s[68:69], 0x2f0
; CHECK-NEXT: s_mov_b32 s69, s68
; CHECK-NEXT: s_mov_b32 s70, s68
; CHECK-NEXT: s_mov_b32 s71, s68
-; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1
+; CHECK-NEXT: v_writelane_b32 v7, s67, 31
+; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s52, v7, 0
; CHECK-NEXT: v_mov_b32_e32 v1, v2
-; CHECK-NEXT: ; implicit-def: $vgpr6 : SGPR spill to VGPR lane
-; CHECK-NEXT: s_mov_b32 s6, 48
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_writelane_b32 v6, s36, 0
-; CHECK-NEXT: v_writelane_b32 v6, s37, 1
-; CHECK-NEXT: v_writelane_b32 v6, s38, 2
-; CHECK-NEXT: v_writelane_b32 v6, s39, 3
-; CHECK-NEXT: v_writelane_b32 v6, s40, 4
-; CHECK-NEXT: v_writelane_b32 v6, s41, 5
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[68:71] dmask:0x1
-; CHECK-NEXT: v_writelane_b32 v6, s42, 6
-; CHECK-NEXT: v_writelane_b32 v6, s43, 7
-; CHECK-NEXT: v_writelane_b32 v6, s44, 8
-; CHECK-NEXT: v_writelane_b32 v6, s45, 9
-; CHECK-NEXT: v_writelane_b32 v6, s46, 10
-; CHECK-NEXT: v_writelane_b32 v6, s47, 11
-; CHECK-NEXT: v_writelane_b32 v6, s48, 12
-; CHECK-NEXT: v_writelane_b32 v6, s49, 13
-; CHECK-NEXT: v_writelane_b32 v6, s50, 14
-; CHECK-NEXT: s_movk_i32 s56, 0x1f0
-; CHECK-NEXT: s_movk_i32 s72, 0x2f0
-; CHECK-NEXT: s_mov_b32 s57, s24
-; CHECK-NEXT: s_mov_b32 s73, s24
-; CHECK-NEXT: v_writelane_b32 v6, s51, 15
-; CHECK-NEXT: s_load_dwordx8 s[24:31], s[6:7], 0x0
-; CHECK-NEXT: s_load_dwordx16 s[36:51], s[56:57], 0x0
-; CHECK-NEXT: v_and_b32_e32 v0, 1, v0
-; CHECK-NEXT: s_load_dwordx16 s[52:67], s[72:73], 0x0
-; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v0
+; CHECK-NEXT: v_readlane_b32 s53, v7, 1
+; CHECK-NEXT: v_readlane_b32 s54, v7, 2
+; CHECK-NEXT: v_readlane_b32 s55, v7, 3
+; CHECK-NEXT: v_readlane_b32 s56, v7, 4
+; CHECK-NEXT: v_readlane_b32 s57, v7, 5
+; CHECK-NEXT: v_readlane_b32 s58, v7, 6
+; CHECK-NEXT: v_readlane_b32 s59, v7, 7
+; CHECK-NEXT: v_and_b32_e32 v5, 1, v0
+; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], 1, v5
+; CHECK-NEXT: v_readlane_b32 s60, v7, 8
+; CHECK-NEXT: v_readlane_b32 s61, v7, 9
+; CHECK-NEXT: v_readlane_b32 s62, v7, 10
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[68:71] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s63, v7, 11
+; CHECK-NEXT: v_readlane_b32 s64, v7, 12
+; CHECK-NEXT: v_readlane_b32 s65, v7, 13
+; CHECK-NEXT: v_readlane_b32 s66, v7, 14
+; CHECK-NEXT: v_readlane_b32 s67, v7, 15
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_mul_f32_e32 v0, v4, v3
; CHECK-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
; CHECK-NEXT: s_xor_b64 s[6:7], exec, s[6:7]
; CHECK-NEXT: s_cbranch_execz .LBB0_3
; CHECK-NEXT: ; %bb.1: ; %bb48
-; CHECK-NEXT: image_sample_lz v3, v[1:2], s[16:23], s[68:71] dmask:0x1
-; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_readlane_b32 s52, v7, 16
+; CHECK-NEXT: v_readlane_b32 s60, v7, 24
+; CHECK-NEXT: v_readlane_b32 s61, v7, 25
+; CHECK-NEXT: v_readlane_b32 s62, v7, 26
+; CHECK-NEXT: v_readlane_b32 s63, v7, 27
+; CHECK-NEXT: v_readlane_b32 s64, v7, 28
+; CHECK-NEXT: v_readlane_b32 s65, v7, 29
+; CHECK-NEXT: v_readlane_b32 s66, v7, 30
+; CHECK-NEXT: v_readlane_b32 s67, v7, 31
; CHECK-NEXT: s_and_b64 vcc, exec, -1
+; CHECK-NEXT: v_readlane_b32 s53, v7, 17
+; CHECK-NEXT: v_readlane_b32 s54, v7, 18
+; CHECK-NEXT: v_readlane_b32 s55, v7, 19
+; CHECK-NEXT: v_readlane_b32 s56, v7, 20
+; CHECK-NEXT: image_sample_lz v3, v[1:2], s[60:67], s[68:71] dmask:0x1
+; CHECK-NEXT: v_mov_b32_e32 v1, v2
+; CHECK-NEXT: v_readlane_b32 s57, v7, 21
+; CHECK-NEXT: v_readlane_b32 s58, v7, 22
+; CHECK-NEXT: v_readlane_b32 s59, v7, 23
; CHECK-NEXT: .LBB0_2: ; %bb50
; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: s_mov_b32 s69, s68
-; CHECK-NEXT: s_mov_b32 s70, s68
-; CHECK-NEXT: s_mov_b32 s71, s68
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[44:51], s[28:31] dmask:0x1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[16:23], s[28:31] dmask:0x1
; CHECK-NEXT: s_nop 0
-; CHECK-NEXT: image_sample_lz v1, v[1:2], s[60:67], s[68:71] dmask:0x1
+; CHECK-NEXT: image_sample_lz v1, v[1:2], s[44:51], s[68:71] dmask:0x1
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_sub_f32_e32 v1, v1, v4
; CHECK-NEXT: v_mul_f32_e32 v1, v1, v0
@@ -103,60 +141,75 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: s_mov_b64 vcc, vcc
; CHECK-NEXT: s_cbranch_vccnz .LBB0_2
; CHECK-NEXT: .LBB0_3: ; %Flow14
-; CHECK-NEXT: s_andn2_saveexec_b64 s[20:21], s[6:7]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[6:7], s[6:7]
; CHECK-NEXT: s_cbranch_execz .LBB0_10
; CHECK-NEXT: ; %bb.4: ; %bb32
+; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_and_saveexec_b64 s[16:17], s[4:5]
-; CHECK-NEXT: s_xor_b64 s[22:23], exec, s[16:17]
+; CHECK-NEXT: s_xor_b64 s[4:5], exec, s[16:17]
; CHECK-NEXT: s_cbranch_execz .LBB0_6
; CHECK-NEXT: ; %bb.5: ; %bb43
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s44, 0
-; CHECK-NEXT: s_mov_b32 s45, s44
-; CHECK-NEXT: v_mov_b32_e32 v2, s44
-; CHECK-NEXT: v_mov_b32_e32 v3, s45
-; CHECK-NEXT: s_mov_b32 s46, s44
-; CHECK-NEXT: s_mov_b32 s47, s44
-; CHECK-NEXT: image_sample_lz v1, v[2:3], s[8:15], s[44:47] dmask:0x1
-; CHECK-NEXT: v_readlane_b32 s4, v6, 0
-; CHECK-NEXT: v_readlane_b32 s12, v6, 8
-; CHECK-NEXT: v_readlane_b32 s13, v6, 9
-; CHECK-NEXT: v_readlane_b32 s14, v6, 10
-; CHECK-NEXT: v_readlane_b32 s15, v6, 11
-; CHECK-NEXT: v_readlane_b32 s16, v6, 12
-; CHECK-NEXT: v_readlane_b32 s17, v6, 13
-; CHECK-NEXT: v_readlane_b32 s18, v6, 14
-; CHECK-NEXT: v_readlane_b32 s19, v6, 15
-; CHECK-NEXT: v_readlane_b32 s5, v6, 1
-; CHECK-NEXT: v_readlane_b32 s6, v6, 2
-; CHECK-NEXT: v_readlane_b32 s7, v6, 3
-; CHECK-NEXT: v_readlane_b32 s8, v6, 4
-; CHECK-NEXT: v_readlane_b32 s9, v6, 5
-; CHECK-NEXT: image_sample_lz v0, v[2:3], s[12:19], s[24:27] dmask:0x1
-; CHECK-NEXT: v_mov_b32_e32 v2, 0
-; CHECK-NEXT: v_mov_b32_e32 v3, v2
-; CHECK-NEXT: v_readlane_b32 s10, v6, 6
-; CHECK-NEXT: v_readlane_b32 s11, v6, 7
+; CHECK-NEXT: s_mov_b32 s16, 0
+; CHECK-NEXT: s_mov_b32 s17, s16
+; CHECK-NEXT: v_mov_b32_e32 v0, s16
+; CHECK-NEXT: v_readlane_b32 s44, v7, 16
+; CHECK-NEXT: v_mov_b32_e32 v1, s17
+; CHECK-NEXT: s_mov_b32 s18, s16
+; CHECK-NEXT: s_mov_b32 s19, s16
+; CHECK-NEXT: v_readlane_b32 s45, v7, 17
+; CHECK-NEXT: v_readlane_b32 s46, v7, 18
+; CHECK-NEXT: v_readlane_b32 s47, v7, 19
+; CHECK-NEXT: v_readlane_b32 s48, v7, 20
+; CHECK-NEXT: v_readlane_b32 s49, v7, 21
+; CHECK-NEXT: v_readlane_b32 s50, v7, 22
+; CHECK-NEXT: v_readlane_b32 s51, v7, 23
+; CHECK-NEXT: v_readlane_b32 s52, v7, 24
+; CHECK-NEXT: v_readlane_b32 s53, v7, 25
+; CHECK-NEXT: v_readlane_b32 s54, v7, 26
+; CHECK-NEXT: v_readlane_b32 s55, v7, 27
+; CHECK-NEXT: v_readlane_b32 s56, v7, 28
+; CHECK-NEXT: v_readlane_b32 s57, v7, 29
+; CHECK-NEXT: v_readlane_b32 s58, v7, 30
+; CHECK-NEXT: v_readlane_b32 s59, v7, 31
+; CHECK-NEXT: image_sample_lz v2, v[0:1], s[44:51], s[16:19] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s44, v7, 0
+; CHECK-NEXT: v_readlane_b32 s52, v7, 8
+; CHECK-NEXT: v_readlane_b32 s53, v7, 9
+; CHECK-NEXT: v_readlane_b32 s54, v7, 10
+; CHECK-NEXT: v_readlane_b32 s55, v7, 11
+; CHECK-NEXT: v_readlane_b32 s56, v7, 12
+; CHECK-NEXT: v_readlane_b32 s57, v7, 13
+; CHECK-NEXT: v_readlane_b32 s58, v7, 14
+; CHECK-NEXT: v_readlane_b32 s59, v7, 15
+; CHECK-NEXT: v_mov_b32_e32 v3, 0
+; CHECK-NEXT: v_mov_b32_e32 v4, v3
+; CHECK-NEXT: v_readlane_b32 s45, v7, 1
+; CHECK-NEXT: v_readlane_b32 s46, v7, 2
+; CHECK-NEXT: v_readlane_b32 s47, v7, 3
+; CHECK-NEXT: image_sample_lz v0, v[0:1], s[52:59], s[24:27] dmask:0x1
+; CHECK-NEXT: v_readlane_b32 s48, v7, 4
+; CHECK-NEXT: v_readlane_b32 s49, v7, 5
+; CHECK-NEXT: v_readlane_b32 s50, v7, 6
+; CHECK-NEXT: v_readlane_b32 s51, v7, 7
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dwordx3 v[1:3], off, s[44:47], 0
+; CHECK-NEXT: buffer_store_dwordx3 v[2:4], off, s[16:19], 0
; CHECK-NEXT: s_waitcnt vmcnt(1)
-; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[44:47], 0
+; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[16:19], 0
; CHECK-NEXT: ; implicit-def: $vgpr0
; CHECK-NEXT: .LBB0_6: ; %Flow12
-; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[22:23]
+; CHECK-NEXT: s_andn2_saveexec_b64 s[4:5], s[4:5]
; CHECK-NEXT: s_cbranch_execz .LBB0_9
; CHECK-NEXT: ; %bb.7: ; %bb33.preheader
-; CHECK-NEXT: s_mov_b32 s8, 0
-; CHECK-NEXT: s_mov_b32 s12, s8
-; CHECK-NEXT: s_mov_b32 s13, s8
-; CHECK-NEXT: v_mov_b32_e32 v1, s12
-; CHECK-NEXT: s_mov_b32 s9, s8
-; CHECK-NEXT: s_mov_b32 s10, s8
-; CHECK-NEXT: s_mov_b32 s11, s8
-; CHECK-NEXT: v_mov_b32_e32 v2, s13
-; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: image_sample_lz v3, v[1:2], s[36:43], s[8:11] dmask:0x1
-; CHECK-NEXT: image_sample_lz v4, v[1:2], s[52:59], s[8:11] dmask:0x1
+; CHECK-NEXT: s_mov_b32 s16, 0
+; CHECK-NEXT: s_mov_b32 s20, s16
+; CHECK-NEXT: s_mov_b32 s21, s16
+; CHECK-NEXT: v_mov_b32_e32 v1, s20
+; CHECK-NEXT: s_mov_b32 s17, s16
+; CHECK-NEXT: s_mov_b32 s18, s16
+; CHECK-NEXT: s_mov_b32 s19, s16
+; CHECK-NEXT: v_mov_b32_e32 v2, s21
+; CHECK-NEXT: image_sample_lz v3, v[1:2], s[8:15], s[16:19] dmask:0x1
+; CHECK-NEXT: image_sample_lz v4, v[1:2], s[36:43], s[16:19] dmask:0x1
; CHECK-NEXT: s_and_b64 vcc, exec, 0
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: v_sub_f32_e32 v1, v4, v3
@@ -171,33 +224,33 @@ define void @main(i1 %arg) #0 {
; CHECK-NEXT: .LBB0_9: ; %Flow13
; CHECK-NEXT: s_or_b64 exec, exec, s[4:5]
; CHECK-NEXT: .LBB0_10: ; %UnifiedReturnBlock
-; CHECK-NEXT: s_or_b64 exec, exec, s[20:21]
-; CHECK-NEXT: v_readlane_b32 s71, v5, 21
-; CHECK-NEXT: v_readlane_b32 s70, v5, 20
-; CHECK-NEXT: v_readlane_b32 s69, v5, 19
-; CHECK-NEXT: v_readlane_b32 s68, v5, 18
+; CHECK-NEXT: s_or_b64 exec, exec, s[6:7]
+; CHECK-NEXT: v_readlane_b32 s71, v6, 21
+; CHECK-NEXT: v_readlane_b32 s70, v6, 20
+; CHECK-NEXT: v_readlane_b32 s69, v6, 19
+; CHECK-NEXT: v_readlane_b32 s68, v6, 18
+; CHECK-NEXT: v_readlane_b32 s67, v6, 17
+; CHECK-NEXT: v_readlane_b32 s66, v6, 16
+; CHECK-NEXT: v_readlane_b32 s65, v6, 15
+; CHECK-NEXT: v_readlane_b32 s64, v6, 14
+; CHECK-NEXT: v_readlane_b32 s55, v6, 13
+; CHECK-NEXT: v_readlane_b32 s54, v6, 12
+; CHECK-NEXT: v_readlane_b32 s53, v6, 11
+; CHECK-NEXT: v_readlane_b32 s52, v6, 10
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
-; CHECK-NEXT: v_readlane_b32 s67, v5, 17
-; CHECK-NEXT: v_readlane_b32 s66, v5, 16
-; CHECK-NEXT: v_readlane_b32 s65, v5, 15
-; CHECK-NEXT: v_readlane_b32 s64, v5, 14
-; CHECK-NEXT: v_readlane_b32 s55, v5, 13
-; CHECK-NEXT: v_readlane_b32 s54, v5, 12
-; CHECK-NEXT: v_readlane_b32 s53, v5, 11
-; CHECK-NEXT: v_readlane_b32 s52, v5, 10
-; CHECK-NEXT: v_readlane_b32 s51, v5, 9
-; CHECK-NEXT: v_readlane_b32 s50, v5, 8
-; CHECK-NEXT: v_readlane_b32 s49, v5, 7
-; CHECK-NEXT: v_readlane_b32 s48, v5, 6
-; CHECK-NEXT: v_readlane_b32 s39, v5, 5
-; CHECK-NEXT: v_readlane_b32 s38, v5, 4
-; CHECK-NEXT: v_readlane_b32 s37, v5, 3
-; CHECK-NEXT: v_readlane_b32 s36, v5, 2
-; CHECK-NEXT: v_readlane_b32 s31, v5, 1
-; CHECK-NEXT: v_readlane_b32 s30, v5, 0
+; CHECK-NEXT: v_readlane_b32 s51, v6, 9
+; CHECK-NEXT: v_readlane_b32 s50, v6, 8
+; CHECK-NEXT: v_readlane_b32 s49, v6, 7
+; CHECK-NEXT: v_readlane_b32 s48, v6, 6
+; CHECK-NEXT: v_readlane_b32 s39, v6, 5
+; CHECK-NEXT: v_readlane_b32 s38, v6, 4
+; CHECK-NEXT: v_readlane_b32 s37, v6, 3
+; CHECK-NEXT: v_readlane_b32 s36, v6, 2
+; CHECK-NEXT: v_readlane_b32 s31, v6, 1
+; CHECK-NEXT: v_readlane_b32 s30, v6, 0
; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1
-; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 ; 4-byte Folded Reload
-; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 ; 4-byte Folded Reload
+; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
index 59dfd71..bd11b07 100644
--- a/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
+++ b/llvm/test/CodeGen/AMDGPU/infer-addrspace-flat-atomic.ll
@@ -11,8 +11,8 @@ define protected amdgpu_kernel void @InferNothing(i32 %a, ptr %b, double %c) {
; CHECK-NEXT: v_mov_b32_e32 v2, s2
; CHECK-NEXT: v_mov_b32_e32 v3, s3
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
-; CHECK-NEXT: s_add_u32 s0, s2, s0
-; CHECK-NEXT: s_addc_u32 s1, s3, s1
+; CHECK-NEXT: s_add_u32 s0, s0, s2
+; CHECK-NEXT: s_addc_u32 s1, s1, s3
; CHECK-NEXT: v_mov_b32_e32 v1, s1
; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -8, s0
; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
@@ -69,13 +69,13 @@ define protected amdgpu_kernel void @InferMixed(i32 %a, ptr addrspace(1) %b, dou
; CHECK-NEXT: s_lshl_b64 s[2:3], s[6:7], 3
; CHECK-NEXT: s_add_u32 s0, s0, s2
; CHECK-NEXT: s_addc_u32 s1, s1, s3
+; CHECK-NEXT: s_add_u32 s0, s0, -8
+; CHECK-NEXT: s_addc_u32 s1, s1, -1
; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
-; CHECK-NEXT: v_mov_b32_e32 v1, s1
-; CHECK-NEXT: v_add_co_u32_e64 v0, vcc, -7, s0
-; CHECK-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v1, vcc
-; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3]
+; CHECK-NEXT: v_pk_mov_b32 v[0:1], s[0:1], s[0:1] op_sel:[0,1]
+; CHECK-NEXT: flat_atomic_add_f64 v[0:1], v[2:3] offset:1
; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
; CHECK-NEXT: buffer_wbinvl1_vol
; CHECK-NEXT: s_endpgm
@@ -113,7 +113,7 @@ define protected amdgpu_kernel void @InferPHI(i32 %a, ptr addrspace(1) %b, doubl
; CHECK-NEXT: s_addc_u32 s1, s1, s5
; CHECK-NEXT: s_add_u32 s4, s0, -8
; CHECK-NEXT: s_addc_u32 s5, s1, -1
-; CHECK-NEXT: s_cmp_eq_u64 s[0:1], 9
+; CHECK-NEXT: s_cmp_eq_u64 s[4:5], 1
; CHECK-NEXT: s_cselect_b64 s[0:1], -1, 0
; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
; CHECK-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v0
diff --git a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
index 48bf7fb..3eef616 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-frame-extern.ll
@@ -46,8 +46,8 @@ define void @use_extern_normal() #0 {
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x4048f5c3
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_addc_u32 s5, s5, s7
+; CHECK-NEXT: s_add_u32 s4, s6, s4
+; CHECK-NEXT: s_addc_u32 s5, s7, s5
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
@@ -70,8 +70,8 @@ define void @use_extern_overalign() #0 {
; CHECK-NEXT: s_ashr_i32 s5, s15, 31
; CHECK-NEXT: v_mov_b32_e32 v0, 0x42280000
; CHECK-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; CHECK-NEXT: s_add_u32 s4, s4, s6
-; CHECK-NEXT: s_addc_u32 s5, s5, s7
+; CHECK-NEXT: s_add_u32 s4, s6, s4
+; CHECK-NEXT: s_addc_u32 s5, s7, s5
; CHECK-NEXT: s_load_dword s4, s[4:5], 0x0
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: v_mov_b32_e32 v1, s4
diff --git a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
index ca77482..fa52b96 100644
--- a/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
+++ b/llvm/test/CodeGen/AMDGPU/limit-coalesce.mir
@@ -1,19 +1,9 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
# RUN: llc -mtriple=amdgcn -run-pass register-coalescer -o - %s | FileCheck %s
-# Check that coalescer does not create wider register tuple than in source
-
-# CHECK: - { id: 2, class: vreg_64, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 3, class: vreg_64, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 4, class: vreg_64, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 5, class: vreg_96, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 6, class: vreg_96, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 7, class: vreg_128, preferred-register: '', flags: [ ] }
-# CHECK: - { id: 8, class: vreg_128, preferred-register: '', flags: [ ] }
+# Check that the coalescer does not create a wider register tuple than in the
+# source.
# No more registers shall be defined
-# CHECK-NEXT: liveins:
-# CHECK: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, %4,
-# CHECK: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, %6,
-
---
name: main
alignment: 1
@@ -52,6 +42,23 @@ body: |
bb.0.entry:
liveins: $sgpr0, $vgpr0_vgpr1
+ ; CHECK-LABEL: name: main
+ ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[DEF:%[0-9]+]]:vreg_64 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[COPY:%[0-9]+]].sub0:vreg_64 = COPY $sgpr0
+ ; CHECK-NEXT: [[COPY:%[0-9]+]].sub1:vreg_64 = COPY [[DEF]].sub0
+ ; CHECK-NEXT: undef [[COPY1:%[0-9]+]].sub0:vreg_64 = COPY [[COPY]].sub1
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]].sub1:vreg_64 = COPY [[COPY]].sub0
+ ; CHECK-NEXT: FLAT_STORE_DWORDX2 $vgpr0_vgpr1, [[COPY1]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[DEF1:%[0-9]+]]:vreg_96 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[COPY2:%[0-9]+]].sub0_sub1:vreg_96 = COPY [[DEF1]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]].sub2:vreg_96 = COPY [[DEF]].sub0
+ ; CHECK-NEXT: FLAT_STORE_DWORDX3 $vgpr0_vgpr1, [[COPY2]], 0, 0, implicit $exec, implicit $flat_scr
+ ; CHECK-NEXT: [[DEF2:%[0-9]+]]:vreg_128 = IMPLICIT_DEF
+ ; CHECK-NEXT: undef [[COPY3:%[0-9]+]].sub0_sub1_sub2:vreg_128 = COPY [[DEF2]]
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]].sub3:vreg_128 = COPY [[DEF]].sub0
+ ; CHECK-NEXT: FLAT_STORE_DWORDX4 $vgpr0_vgpr1, [[COPY3]], 0, 0, implicit $exec, implicit $flat_scr
%3 = IMPLICIT_DEF
undef %4.sub0 = COPY $sgpr0
%4.sub1 = COPY %3.sub0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
index de7d234..b9bf76c 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.quadmask.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11 %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11 %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=1 < %s | FileCheck -check-prefixes=GFX11,GFX11-GISEL %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -global-isel=0 < %s | FileCheck -check-prefixes=GFX11,GFX11-SDAG %s
declare i32 @llvm.amdgcn.s.quadmask.i32(i32)
declare i64 @llvm.amdgcn.s.quadmask.i64(i64)
@@ -172,3 +172,91 @@ entry:
%qm = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %mask)
ret i64 %qm
}
+
+;; Ensure that the AND/ICMP pair is not fused into a single AND (dropping the s_cmp_eq_u32), because s_quadmask_b32 implicitly defines SCC; the checks below expect an explicit s_cmp_eq_u32 before the s_cselect_b32.
+define amdgpu_kernel void @test_scc_quadmask_32(i32 %val0, i32 %val1, ptr addrspace(1) %ptr) {
+; GFX11-GISEL-LABEL: test_scc_quadmask_32:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, 0
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_and_b32 s0, s0, 1
+; GFX11-GISEL-NEXT: s_quadmask_b32 s1, s1
+; GFX11-GISEL-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, s1
+; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, s0
+; GFX11-GISEL-NEXT: global_store_b32 v2, v3, s[2:3]
+; GFX11-GISEL-NEXT: global_store_b32 v[0:1], v4, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_scc_quadmask_32:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x24
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_and_b32 s0, s0, 1
+; GFX11-SDAG-NEXT: s_quadmask_b32 s1, s1
+; GFX11-SDAG-NEXT: s_cmp_eq_u32 s0, 0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
+; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, s0
+; GFX11-SDAG-NEXT: global_store_b32 v2, v3, s[2:3]
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v4, off
+; GFX11-SDAG-NEXT: s_endpgm
+ %and = and i32 %val0, 1
+ %result = call i32 @llvm.amdgcn.s.quadmask.i32(i32 %val1) nounwind readnone
+ store i32 %result, ptr addrspace(1) %ptr
+ %cmp = icmp eq i32 %and, 0
+ %sel = select i1 %cmp, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) null, align 4
+ ret void
+}
+
+;; Ensure that the AND/ICMP pair is not fused into a single AND (dropping the s_cmp_eq_u32), because s_quadmask_b64 implicitly defines SCC; the checks below expect an explicit s_cmp_eq_u32 before the s_cselect_b32.
+define amdgpu_kernel void @test_scc_quadmask_64(i32 %val0, i64 %val1, ptr addrspace(1) %ptr) {
+; GFX11-GISEL-LABEL: test_scc_quadmask_64:
+; GFX11-GISEL: ; %bb.0:
+; GFX11-GISEL-NEXT: s_clause 0x1
+; GFX11-GISEL-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-GISEL-NEXT: s_load_b32 s4, s[4:5], 0x24
+; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-GISEL-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; GFX11-GISEL-NEXT: s_and_b32 s4, s4, 1
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX11-GISEL-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v1, s1
+; GFX11-GISEL-NEXT: s_cselect_b32 s0, 1, 0
+; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v5, s0
+; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GFX11-GISEL-NEXT: global_store_b64 v4, v[0:1], s[2:3]
+; GFX11-GISEL-NEXT: global_store_b32 v[2:3], v5, off
+; GFX11-GISEL-NEXT: s_endpgm
+;
+; GFX11-SDAG-LABEL: test_scc_quadmask_64:
+; GFX11-SDAG: ; %bb.0:
+; GFX11-SDAG-NEXT: s_clause 0x1
+; GFX11-SDAG-NEXT: s_load_b32 s6, s[4:5], 0x24
+; GFX11-SDAG-NEXT: s_load_b128 s[0:3], s[4:5], 0x2c
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 0
+; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0)
+; GFX11-SDAG-NEXT: s_and_b32 s4, s6, 1
+; GFX11-SDAG-NEXT: s_quadmask_b64 s[0:1], s[0:1]
+; GFX11-SDAG-NEXT: s_cmp_eq_u32 s4, 0
+; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v3, s1
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, s0
+; GFX11-SDAG-NEXT: s_cselect_b32 s0, -1, 0
+; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 0
+; GFX11-SDAG-NEXT: v_cndmask_b32_e64 v5, 0, 1, s0
+; GFX11-SDAG-NEXT: global_store_b64 v4, v[2:3], s[2:3]
+; GFX11-SDAG-NEXT: global_store_b32 v[0:1], v5, off
+; GFX11-SDAG-NEXT: s_endpgm
+ %and = and i32 %val0, 1
+ %result = call i64 @llvm.amdgcn.s.quadmask.i64(i64 %val1) nounwind readnone
+ store i64 %result, ptr addrspace(1) %ptr
+ %cmp = icmp eq i32 %and, 0
+ %sel = select i1 %cmp, i32 1, i32 0
+ store i32 %sel, ptr addrspace(1) null, align 4
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
index ea9d5e8..1e6b77e 100644
--- a/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
+++ b/llvm/test/CodeGen/AMDGPU/loop-prefetch-data.ll
@@ -400,9 +400,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
; GFX12-NEXT: s_wait_alu 0xf1ff
@@ -438,9 +438,9 @@ define amdgpu_kernel void @copy_flat_divergent(ptr nocapture %d, ptr nocapture r
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
-; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
@@ -531,9 +531,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d,
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX12-NEXT: s_wait_kmcnt 0x0
-; GFX12-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-NEXT: v_add_co_u32 v2, s1, s6, v0
; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
; GFX12-NEXT: v_add_co_u32 v0, s1, s4, v0
; GFX12-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
; GFX12-NEXT: s_wait_alu 0xf1ff
@@ -569,9 +569,9 @@ define amdgpu_kernel void @copy_global_divergent(ptr addrspace(1) nocapture %d,
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX12-SPREFETCH-NEXT: v_lshlrev_b32_e32 v0, 4, v0
; GFX12-SPREFETCH-NEXT: s_wait_kmcnt 0x0
-; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, v0, s6
+; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, s1, s6, v0
; GFX12-SPREFETCH-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3)
-; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, 0, s7, s1
+; GFX12-SPREFETCH-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s1
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v0, s1, s4, v0
; GFX12-SPREFETCH-NEXT: v_add_co_u32 v2, vcc_lo, 0xb0, v2
; GFX12-SPREFETCH-NEXT: s_wait_alu 0xf1ff
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
index 0de7f8f..bd29e9e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-precise-allocate-to-module-struct.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --function-signature --check-globals
-; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds --amdgpu-lower-module-lds-strategy=module < %s | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s | FileCheck %s
; Regression test for issue 160181
; One variable is chosen to be assigned at zero. Here, that's @both
@@ -22,12 +22,20 @@
;.
; CHECK: @llvm.amdgcn.module.lds = internal addrspace(3) global %llvm.amdgcn.module.lds.t poison, align 4, !absolute_symbol [[META0:![0-9]+]]
; CHECK: @llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @llvm.amdgcn.module.lds to ptr)], section "llvm.metadata"
+; CHECK: @llvm.amdgcn.kernel.kern_one.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_one.lds.t poison, align 4, !absolute_symbol [[META1:![0-9]+]]
+; CHECK: @llvm.amdgcn.kernel.kern_two.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_two.lds.t poison, align 4, !absolute_symbol [[META1]]
+; CHECK: @llvm.amdgcn.kernel.kern_block_direct_allocation.lds = internal addrspace(3) global %llvm.amdgcn.kernel.kern_block_direct_allocation.lds.t poison, align 4, !absolute_symbol [[META1]]
+
;.
define void @func_one() {
; CHECK-LABEL: define {{[^@]+}}@func_one() {
-; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1:![0-9]+]]
-; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18:![0-9]+]]
-; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23:![0-9]+]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2:![0-9]+]]
+; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4
+; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[ONE1]], align 4
+; CHECK-NEXT: store i16 10, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11:![0-9]+]]
; CHECK-NEXT: ret void
;
%val0 = load i32, ptr addrspace(3) @both
@@ -38,9 +46,10 @@ define void @func_one() {
define amdgpu_kernel void @kern_one() {
; CHECK-LABEL: define {{[^@]+}}@kern_one
-; CHECK-SAME: () #[[ATTR0:[0-9]+]] {
+; CHECK-SAME: () #[[ATTR0:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META16:![0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META24:![0-9]+]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_one.lds) ]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !noalias [[META17:![0-9]+]]
; CHECK-NEXT: call void @func_one()
; CHECK-NEXT: ret void
;
@@ -51,9 +60,13 @@ entry:
define void @func_two() {
; CHECK-LABEL: define {{[^@]+}}@func_two() {
-; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]]
-; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25:![0-9]+]]
-; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[VAL0:%.*]] = load i32, ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]]
+; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4
+; CHECK-NEXT: [[TWO1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT: store i32 [[VAL0]], ptr addrspace(3) [[TWO1]], align 4
+; CHECK-NEXT: store i16 20, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]]
; CHECK-NEXT: ret void
;
%val0 = load i32, ptr addrspace(3) @both
@@ -64,9 +77,10 @@ define void @func_two() {
define amdgpu_kernel void @kern_two() {
; CHECK-LABEL: define {{[^@]+}}@kern_two
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR0]] !llvm.amdgcn.lds.kernel.id [[META18:![0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META26:![0-9]+]], !noalias [[META27:![0-9]+]]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_two.lds) ]
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ], !alias.scope [[META19:![0-9]+]], !noalias [[META20:![0-9]+]]
; CHECK-NEXT: call void @func_two()
; CHECK-NEXT: ret void
;
@@ -82,11 +96,18 @@ entry:
; remains the best candidate for address zero allocation.
define void @func_block_direct_allocation() {
; CHECK-LABEL: define {{[^@]+}}@func_block_direct_allocation() {
-; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META18]]
-; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 2), align 4, !noalias [[META25]]
+; CHECK-NEXT: [[TMP1:%.*]] = call i32 @llvm.amdgcn.lds.kernel.id()
+; CHECK-NEXT: [[ONE:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 0
+; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr addrspace(4) [[ONE]], align 4
+; CHECK-NEXT: [[ONE1:%.*]] = inttoptr i32 [[TMP2]] to ptr addrspace(3)
+; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr addrspace(3) [[ONE1]], align 4
+; CHECK-NEXT: [[TWO:%.*]] = getelementptr inbounds [3 x [2 x i32]], ptr addrspace(4) @llvm.amdgcn.lds.offset.table, i32 0, i32 [[TMP1]], i32 1
+; CHECK-NEXT: [[TMP3:%.*]] = load i32, ptr addrspace(4) [[TWO]], align 4
+; CHECK-NEXT: [[TWO2:%.*]] = inttoptr i32 [[TMP3]] to ptr addrspace(3)
+; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr addrspace(3) [[TWO2]], align 4
; CHECK-NEXT: [[SUM:%.*]] = add i32 [[VAL1]], [[VAL2]]
-; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META1]]
-; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 3), align 4, !noalias [[META23]]
+; CHECK-NEXT: store i32 [[SUM]], ptr addrspace(3) @llvm.amdgcn.module.lds, align 4, !noalias [[META2]]
+; CHECK-NEXT: store i16 30, ptr addrspace(3) getelementptr inbounds ([[LLVM_AMDGCN_MODULE_LDS_T:%.*]], ptr addrspace(3) @llvm.amdgcn.module.lds, i32 0, i32 1), align 4, !noalias [[META11]]
; CHECK-NEXT: ret void
;
%val1 = load i32, ptr addrspace(3) @one
@@ -99,7 +120,8 @@ define void @func_block_direct_allocation() {
define amdgpu_kernel void @kern_block_direct_allocation() {
; CHECK-LABEL: define {{[^@]+}}@kern_block_direct_allocation
-; CHECK-SAME: () #[[ATTR0]] {
+; CHECK-SAME: () #[[ATTR1:[0-9]+]] !llvm.amdgcn.lds.kernel.id [[META21:![0-9]+]] {
+; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.kernel.kern_block_direct_allocation.lds) ], !alias.scope [[META22:![0-9]+]], !noalias [[META25:![0-9]+]]
; CHECK-NEXT: call void @llvm.donothing() [ "ExplicitUse"(ptr addrspace(3) @llvm.amdgcn.module.lds) ]
; CHECK-NEXT: call void @func_block_direct_allocation()
; CHECK-NEXT: call void @func_one()
@@ -112,35 +134,8 @@ define amdgpu_kernel void @kern_block_direct_allocation() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="16" }
-; CHECK: attributes #[[ATTR1:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
-;.
-; CHECK: [[META0]] = !{i32 0, i32 1}
-; CHECK: [[META1]] = !{[[META2:![0-9]+]], [[META4:![0-9]+]], [[META5:![0-9]+]], [[META6:![0-9]+]], [[META8:![0-9]+]], [[META9:![0-9]+]], [[META10:![0-9]+]], [[META12:![0-9]+]], [[META13:![0-9]+]], [[META14:![0-9]+]], [[META16:![0-9]+]], [[META17:![0-9]+]]}
-; CHECK: [[META2]] = distinct !{[[META2]], [[META3:![0-9]+]]}
-; CHECK: [[META3]] = distinct !{[[META3]]}
-; CHECK: [[META4]] = distinct !{[[META4]], [[META3]]}
-; CHECK: [[META5]] = distinct !{[[META5]], [[META3]]}
-; CHECK: [[META6]] = distinct !{[[META6]], [[META7:![0-9]+]]}
-; CHECK: [[META7]] = distinct !{[[META7]]}
-; CHECK: [[META8]] = distinct !{[[META8]], [[META7]]}
-; CHECK: [[META9]] = distinct !{[[META9]], [[META7]]}
-; CHECK: [[META10]] = distinct !{[[META10]], [[META11:![0-9]+]]}
-; CHECK: [[META11]] = distinct !{[[META11]]}
-; CHECK: [[META12]] = distinct !{[[META12]], [[META11]]}
-; CHECK: [[META13]] = distinct !{[[META13]], [[META11]]}
-; CHECK: [[META14]] = distinct !{[[META14]], [[META15:![0-9]+]]}
-; CHECK: [[META15]] = distinct !{[[META15]]}
-; CHECK: [[META16]] = distinct !{[[META16]], [[META15]]}
-; CHECK: [[META17]] = distinct !{[[META17]], [[META15]]}
-; CHECK: [[META18]] = !{[[META19:![0-9]+]], [[META2]], [[META5]], [[META20:![0-9]+]], [[META6]], [[META9]], [[META21:![0-9]+]], [[META10]], [[META13]], [[META22:![0-9]+]], [[META14]], [[META17]]}
-; CHECK: [[META19]] = distinct !{[[META19]], [[META3]]}
-; CHECK: [[META20]] = distinct !{[[META20]], [[META7]]}
-; CHECK: [[META21]] = distinct !{[[META21]], [[META11]]}
-; CHECK: [[META22]] = distinct !{[[META22]], [[META15]]}
-; CHECK: [[META23]] = !{[[META19]], [[META4]], [[META5]], [[META20]], [[META8]], [[META9]], [[META21]], [[META12]], [[META13]], [[META22]], [[META16]], [[META17]]}
-; CHECK: [[META24]] = !{[[META10]], [[META12]], [[META13]], [[META14]], [[META16]], [[META17]]}
-; CHECK: [[META25]] = !{[[META19]], [[META2]], [[META4]], [[META20]], [[META6]], [[META8]], [[META21]], [[META10]], [[META12]], [[META22]], [[META14]], [[META16]]}
-; CHECK: [[META26]] = !{[[META22]]}
-; CHECK: [[META27]] = !{[[META14]], [[META16]], [[META17]]}
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-lds-size"="12" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-lds-size"="16" }
+; CHECK: attributes #[[ATTR2:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
index b6f70fa..12212a0 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-hybrid.ll
@@ -84,8 +84,8 @@ define void @f2() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 2
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
diff --git a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
index c316f03..b689e1e 100644
--- a/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
+++ b/llvm/test/CodeGen/AMDGPU/lower-module-lds-via-table.ll
@@ -49,8 +49,8 @@ define void @f0() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+4
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+12
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
@@ -90,8 +90,8 @@ define void @f1() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+8
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+16
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
@@ -131,8 +131,8 @@ define void @f2() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+12
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+20
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v2, s4
@@ -172,8 +172,8 @@ define void @f3() {
; GCN-NEXT: s_add_u32 s6, s6, llvm.amdgcn.lds.offset.table@rel32@lo+16
; GCN-NEXT: s_addc_u32 s7, s7, llvm.amdgcn.lds.offset.table@rel32@hi+24
; GCN-NEXT: s_lshl_b64 s[4:5], s[4:5], 4
-; GCN-NEXT: s_add_u32 s4, s4, s6
-; GCN-NEXT: s_addc_u32 s5, s5, s7
+; GCN-NEXT: s_add_u32 s4, s6, s4
+; GCN-NEXT: s_addc_u32 s5, s7, s5
; GCN-NEXT: s_load_dword s4, s[4:5], 0x0
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: v_mov_b32_e32 v0, s4
diff --git a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
index 65b4d37..93d772f 100644
--- a/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
+++ b/llvm/test/CodeGen/AMDGPU/match-perm-extract-vector-elt-bug.ll
@@ -13,9 +13,9 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX9-NEXT: s_and_b32 s4, s4, 0xffff
; GFX9-NEXT: s_mul_i32 s14, s14, s4
; GFX9-NEXT: s_add_i32 s5, s5, s14
-; GFX9-NEXT: v_add_u32_e32 v0, s5, v0
-; GFX9-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX9-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, s5, v0
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1]
; GFX9-NEXT: v_mov_b32_e32 v1, s1
; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, s0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, v1, v5, vcc
@@ -37,12 +37,12 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX10-NEXT: s_load_dword s4, s[8:9], 0x1c
; GFX10-NEXT: s_load_dword s5, s[8:9], 0x38
; GFX10-NEXT: s_load_dwordx4 s[0:3], s[8:9], 0x0
+; GFX10-NEXT: v_mov_b32_e32 v1, 0
; GFX10-NEXT: s_waitcnt lgkmcnt(0)
; GFX10-NEXT: s_and_b32 s4, s4, 0xffff
; GFX10-NEXT: s_mul_i32 s14, s14, s4
-; GFX10-NEXT: v_add3_u32 v0, s5, s14, v0
-; GFX10-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX10-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX10-NEXT: v_add3_u32 v2, s5, s14, v0
+; GFX10-NEXT: v_ashrrev_i64 v[4:5], 28, v[1:2]
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
; GFX10-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo
; GFX10-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
@@ -62,21 +62,19 @@ define amdgpu_kernel void @test(ptr addrspace(1) %src, ptr addrspace(1) %dst) {
; GFX11-NEXT: s_load_b32 s6, s[4:5], 0x1c
; GFX11-NEXT: s_load_b32 s7, s[4:5], 0x38
; GFX11-NEXT: s_load_b128 s[0:3], s[4:5], 0x0
-; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v0
+; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_and_b32 v1, 0x3ff, v0
; GFX11-NEXT: s_waitcnt lgkmcnt(0)
; GFX11-NEXT: s_and_b32 s4, s6, 0xffff
; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
; GFX11-NEXT: s_mul_i32 s13, s13, s4
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instid1(SALU_CYCLE_1)
-; GFX11-NEXT: v_add3_u32 v0, s7, s13, v0
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; GFX11-NEXT: v_lshlrev_b64 v[4:5], 4, v[0:1]
+; GFX11-NEXT: v_add3_u32 v1, s7, s13, v1
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-NEXT: v_ashrrev_i64 v[4:5], 28, v[0:1]
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, s0, v4
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s1, v5, vcc_lo
; GFX11-NEXT: v_add_co_u32 v4, vcc_lo, s2, v4
-; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11-NEXT: v_add_co_ci_u32_e64 v5, null, s3, v5, vcc_lo
; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off
; GFX11-NEXT: s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
index dd5c247..14b0729 100644
--- a/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/memmove-var-size.ll
@@ -388,8 +388,8 @@ define void @memmove_p0_p3(ptr addrspace(0) align 1 %dst, ptr addrspace(3) align
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB2_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
@@ -684,8 +684,8 @@ define void @memmove_p0_p5(ptr addrspace(0) align 1 %dst, ptr addrspace(5) align
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB4_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v0
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v1, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v0, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v1, v4, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v2, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
@@ -1411,8 +1411,8 @@ define void @memmove_p3_p0(ptr addrspace(3) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB10_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
@@ -1889,8 +1889,8 @@ define void @memmove_p5_p0(ptr addrspace(5) align 1 %dst, ptr addrspace(0) align
; CHECK-NEXT: s_and_saveexec_b32 s7, s4
; CHECK-NEXT: s_cbranch_execz .LBB15_13
; CHECK-NEXT: ; %bb.11: ; %memmove_bwd_residual_loop.preheader
-; CHECK-NEXT: v_add_co_u32 v9, s4, v3, v1
-; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v4, v2, s4
+; CHECK-NEXT: v_add_co_u32 v9, s4, v1, v3
+; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, v2, v4, s4
; CHECK-NEXT: v_add3_u32 v4, v3, v0, -1
; CHECK-NEXT: v_add_co_u32 v9, s4, v9, -1
; CHECK-NEXT: v_add_co_ci_u32_e64 v10, null, -1, v10, s4
diff --git a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
index 6d0aa1e..7e4be65 100644
--- a/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-folding-imm-to-inst-with-fi.ll
@@ -9,92 +9,65 @@ define protected amdgpu_kernel void @no_folding_imm_to_inst_with_fi(<4 x i64> %v
; CHECK-NEXT: s_load_b512 s[16:31], s[4:5], 0xe4
; CHECK-NEXT: s_load_b512 s[0:15], s[4:5], 0xa4
; CHECK-NEXT: s_mov_b64 s[34:35], src_private_base
-; CHECK-NEXT: s_movk_i32 s33, 0x70
-; CHECK-NEXT: s_movk_i32 s34, 0x60
-; CHECK-NEXT: s_or_b32 s44, 0x80, s33
-; CHECK-NEXT: s_mov_b32 s45, s35
-; CHECK-NEXT: s_or_b32 s46, 0x80, s34
-; CHECK-NEXT: s_mov_b32 s47, s35
-; CHECK-NEXT: v_dual_mov_b32 v20, s44 :: v_dual_mov_b32 v21, s45
-; CHECK-NEXT: v_dual_mov_b32 v22, s46 :: v_dual_mov_b32 v23, s47
; CHECK-NEXT: s_movk_i32 s34, 0x80
; CHECK-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
-; CHECK-NEXT: v_dual_mov_b32 v34, s34 :: v_dual_mov_b32 v35, s35
+; CHECK-NEXT: v_dual_mov_b32 v20, s34 :: v_dual_mov_b32 v21, s35
; CHECK-NEXT: s_wait_kmcnt 0x0
; CHECK-NEXT: v_dual_mov_b32 v0, s40 :: v_dual_mov_b32 v1, s41
; CHECK-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
; CHECK-NEXT: v_dual_mov_b32 v4, s36 :: v_dual_mov_b32 v5, s37
; CHECK-NEXT: v_dual_mov_b32 v6, s38 :: v_dual_mov_b32 v7, s39
-; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
-; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: v_dual_mov_b32 v0, s20 :: v_dual_mov_b32 v1, s21
-; CHECK-NEXT: s_movk_i32 s20, 0x50
; CHECK-NEXT: v_dual_mov_b32 v8, s28 :: v_dual_mov_b32 v9, s29
; CHECK-NEXT: v_dual_mov_b32 v10, s30 :: v_dual_mov_b32 v11, s31
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: s_or_b32 s20, 0x80, s20
-; CHECK-NEXT: s_mov_b32 s21, s35
; CHECK-NEXT: v_dual_mov_b32 v12, s24 :: v_dual_mov_b32 v13, s25
; CHECK-NEXT: v_dual_mov_b32 v14, s26 :: v_dual_mov_b32 v15, s27
-; CHECK-NEXT: v_dual_mov_b32 v2, s22 :: v_dual_mov_b32 v3, s23
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v25, s21 :: v_dual_mov_b32 v24, s20
+; CHECK-NEXT: v_dual_mov_b32 v16, s20 :: v_dual_mov_b32 v17, s21
+; CHECK-NEXT: v_dual_mov_b32 v18, s22 :: v_dual_mov_b32 v19, s23
+; CHECK-NEXT: scratch_store_b128 off, v[0:3], off offset:16 scope:SCOPE_SYS
+; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: scratch_store_b128 off, v[4:7], off scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:112 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[22:23], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[24:25], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] offset:80 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
; CHECK-NEXT: v_dual_mov_b32 v0, s16 :: v_dual_mov_b32 v1, s17
-; CHECK-NEXT: s_or_b32 s16, 0x80, 64
-; CHECK-NEXT: s_mov_b32 s17, s35
-; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
-; CHECK-NEXT: s_or_b32 s12, 0x80, 48
-; CHECK-NEXT: s_mov_b32 s13, s35
-; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
-; CHECK-NEXT: s_or_b32 s8, 0x80, 32
-; CHECK-NEXT: s_mov_b32 s9, s35
-; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
-; CHECK-NEXT: s_or_b32 s4, 0x80, 16
-; CHECK-NEXT: s_mov_b32 s5, s35
; CHECK-NEXT: v_dual_mov_b32 v2, s18 :: v_dual_mov_b32 v3, s19
-; CHECK-NEXT: s_wait_alu 0xfffe
-; CHECK-NEXT: v_dual_mov_b32 v27, s17 :: v_dual_mov_b32 v26, s16
+; CHECK-NEXT: v_dual_mov_b32 v4, s12 :: v_dual_mov_b32 v5, s13
; CHECK-NEXT: v_dual_mov_b32 v6, s14 :: v_dual_mov_b32 v7, s15
-; CHECK-NEXT: v_dual_mov_b32 v29, s13 :: v_dual_mov_b32 v28, s12
-; CHECK-NEXT: v_dual_mov_b32 v31, s9 :: v_dual_mov_b32 v30, s8
-; CHECK-NEXT: v_dual_mov_b32 v33, s5 :: v_dual_mov_b32 v32, s4
+; CHECK-NEXT: v_dual_mov_b32 v8, s8 :: v_dual_mov_b32 v9, s9
; CHECK-NEXT: v_dual_mov_b32 v10, s10 :: v_dual_mov_b32 v11, s11
+; CHECK-NEXT: v_dual_mov_b32 v12, s4 :: v_dual_mov_b32 v13, s5
; CHECK-NEXT: v_dual_mov_b32 v14, s6 :: v_dual_mov_b32 v15, s7
; CHECK-NEXT: v_dual_mov_b32 v16, s0 :: v_dual_mov_b32 v17, s1
; CHECK-NEXT: v_dual_mov_b32 v18, s2 :: v_dual_mov_b32 v19, s3
-; CHECK-NEXT: flat_store_b128 v[26:27], v[0:3] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[0:3] offset:64 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[28:29], v[4:7] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[4:7] offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[30:31], v[8:11] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[8:11] offset:32 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[32:33], v[12:15] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[12:15] offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_store_b128 v[34:35], v[16:19] scope:SCOPE_SYS
+; CHECK-NEXT: flat_store_b128 v[20:21], v[16:19] scope:SCOPE_SYS
; CHECK-NEXT: s_wait_storecnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[22:23] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:96 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:112 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[26:27] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:64 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[24:25] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:80 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[30:31] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:32 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[28:29] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:48 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[34:35] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt_dscnt 0x0
-; CHECK-NEXT: flat_load_b128 v[0:3], v[32:33] scope:SCOPE_SYS
+; CHECK-NEXT: flat_load_b128 v[0:3], v[20:21] offset:16 scope:SCOPE_SYS
; CHECK-NEXT: s_wait_loadcnt 0x0
; CHECK-NEXT: s_endpgm
bb:
diff --git a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
index f5e136a..b717f85 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-implicit-kernargs.ll
@@ -337,8 +337,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX942-NEXT: .p2align 8
; GFX942-NEXT: ; %bb.2:
; GFX942-NEXT: .LBB8_0:
-; GFX942-NEXT: s_mov_b32 s4, 8
-; GFX942-NEXT: s_load_dword s0, s[0:1], s4 offset:0x2
+; GFX942-NEXT: s_load_dword s0, s[0:1], 0xa
; GFX942-NEXT: v_mov_b32_e32 v0, 0
; GFX942-NEXT: s_waitcnt lgkmcnt(0)
; GFX942-NEXT: v_mov_b32_e32 v1, s0
@@ -353,8 +352,7 @@ define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out)
; GFX90a-NEXT: .p2align 8
; GFX90a-NEXT: ; %bb.2:
; GFX90a-NEXT: .LBB8_0:
-; GFX90a-NEXT: s_mov_b32 s0, 8
-; GFX90a-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
+; GFX90a-NEXT: s_load_dword s0, s[4:5], 0xa
; GFX90a-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-NEXT: s_waitcnt lgkmcnt(0)
; GFX90a-NEXT: v_mov_b32_e32 v1, s0
diff --git a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
index 760a298..85a9aba 100644
--- a/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
+++ b/llvm/test/CodeGen/AMDGPU/promote-constOffset-to-imm.ll
@@ -608,8 +608,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX10-NEXT: v_mov_b32_e32 v7, 0x7f
; GFX10-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
; GFX10-NEXT: v_lshl_or_b32 v0, v0, 3, v6
-; GFX10-NEXT: v_add_co_u32 v0, s0, v0, s34
-; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0, s35, s0
+; GFX10-NEXT: v_add_co_u32 v0, s0, s34, v0
+; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, s35, 0, s0
; GFX10-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX10-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
; GFX10-NEXT: .LBB1_1: ; %for.cond.preheader
@@ -819,8 +819,8 @@ define hidden amdgpu_kernel void @clmem_read(ptr addrspace(1) %buffer) {
; GFX11-NEXT: v_and_b32_e32 v6, 0xfe000000, v1
; GFX11-NEXT: v_lshl_or_b32 v0, v0, 3, v6
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT: v_add_co_u32 v0, s0, v0, s34
-; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, s35, s0
+; GFX11-NEXT: v_add_co_u32 v0, s0, s34, v0
+; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s35, 0, s0
; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, 0x5000, v0
; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, 0, v1, vcc_lo
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
index ff90f1f..40f39a2 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-mubuf.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX6,GFX6_PTRADD %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX6,GFX6_LEGACY %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=tahiti < %s | FileCheck --check-prefixes=GFX6 %s
; Test PTRADD handling in AMDGPUDAGToDAGISel::SelectMUBUF.
@@ -34,7 +33,3 @@ define amdgpu_kernel void @v_add_i32(ptr addrspace(1) %out, ptr addrspace(1) %in
store i32 %result, ptr addrspace(1) %out
ret void
}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX6_LEGACY: {{.*}}
-; GFX6_PTRADD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
index 7d3b19e..1c986a0 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-optimizations.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -disable-separate-const-offset-from-gep=1 < %s | FileCheck --check-prefixes=GFX942 %s
; Tests for DAG combines and folds related to the ISD::PTRADD SelectionDAG
; opcode. The RUN lines uses -disable-separate-const-offset-from-gep to disable
@@ -24,21 +23,13 @@ define i64 @global_load_ZTwoUses(ptr addrspace(1) %base, i64 %voffset) {
}
define i64 @global_load_gep_add_reassoc(ptr addrspace(1) %base, i64 %voffset) {
-; GFX942_PTRADD-LABEL: global_load_gep_add_reassoc:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
-; GFX942_PTRADD-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0)
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: global_load_gep_add_reassoc:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[2:3], 0, v[0:1]
-; GFX942_LEGACY-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0)
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: global_load_gep_add_reassoc:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, v[2:3]
+; GFX942-NEXT: global_load_dwordx2 v[0:1], v[0:1], off offset:24
+; GFX942-NEXT: s_waitcnt vmcnt(0)
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%add0 = add nuw nsw i64 %voffset, 24
%gep0 = getelementptr nuw inbounds i8, ptr addrspace(1) %base, i64 %add0
%l = load i64, ptr addrspace(1) %gep0, align 8
@@ -221,23 +212,14 @@ define ptr addrspace(1) @shl_neg_offset(ptr addrspace(1) %p, i64 %noffset, i64 %
; Check that offsets are folded into global addresses if possible. For example,
; this is relevant when using --amdgpu-lower-module-lds-strategy=table.
define ptr addrspace(1) @complextype_global_gep(i64 %offset) {
-; GFX942_PTRADD-LABEL: complextype_global_gep:
-; GFX942_PTRADD: ; %bb.0:
-; GFX942_PTRADD-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_getpc_b64 s[0:1]
-; GFX942_PTRADD-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
-; GFX942_PTRADD-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
-; GFX942_PTRADD-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
-; GFX942_PTRADD-NEXT: s_setpc_b64 s[30:31]
-;
-; GFX942_LEGACY-LABEL: complextype_global_gep:
-; GFX942_LEGACY: ; %bb.0:
-; GFX942_LEGACY-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_getpc_b64 s[0:1]
-; GFX942_LEGACY-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
-; GFX942_LEGACY-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
-; GFX942_LEGACY-NEXT: v_lshl_add_u64 v[0:1], v[0:1], 0, s[0:1]
-; GFX942_LEGACY-NEXT: s_setpc_b64 s[30:31]
+; GFX942-LABEL: complextype_global_gep:
+; GFX942: ; %bb.0:
+; GFX942-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX942-NEXT: s_getpc_b64 s[0:1]
+; GFX942-NEXT: s_add_u32 s0, s0, v0@rel32@lo+14
+; GFX942-NEXT: s_addc_u32 s1, s1, v0@rel32@hi+22
+; GFX942-NEXT: v_lshl_add_u64 v[0:1], s[0:1], 0, v[0:1]
+; GFX942-NEXT: s_setpc_b64 s[30:31]
%gep0 = getelementptr inbounds %complextype, ptr addrspace(1) @v0, i64 0, i32 1, i64 %offset
%gep1 = getelementptr inbounds i8, ptr addrspace(1) %gep0, i64 2
ret ptr addrspace(1) %gep1
@@ -430,36 +412,20 @@ define ptr @gep_disjoint_or(ptr %base) {
; Check that AssertAlign nodes between ptradd nodes don't block offset folding,
; taken from preload-implicit-kernargs.ll
define amdgpu_kernel void @random_incorrect_offset(ptr addrspace(1) inreg %out) {
-; GFX942_PTRADD-LABEL: random_incorrect_offset:
-; GFX942_PTRADD: ; %bb.1:
-; GFX942_PTRADD-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: s_branch .LBB21_0
-; GFX942_PTRADD-NEXT: .p2align 8
-; GFX942_PTRADD-NEXT: ; %bb.2:
-; GFX942_PTRADD-NEXT: .LBB21_0:
-; GFX942_PTRADD-NEXT: s_load_dword s0, s[4:5], 0xa
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v0, 0
-; GFX942_PTRADD-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_PTRADD-NEXT: v_mov_b32_e32 v1, s0
-; GFX942_PTRADD-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX942_PTRADD-NEXT: s_endpgm
-;
-; GFX942_LEGACY-LABEL: random_incorrect_offset:
-; GFX942_LEGACY: ; %bb.1:
-; GFX942_LEGACY-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: s_branch .LBB21_0
-; GFX942_LEGACY-NEXT: .p2align 8
-; GFX942_LEGACY-NEXT: ; %bb.2:
-; GFX942_LEGACY-NEXT: .LBB21_0:
-; GFX942_LEGACY-NEXT: s_mov_b32 s0, 8
-; GFX942_LEGACY-NEXT: s_load_dword s0, s[4:5], s0 offset:0x2
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v0, 0
-; GFX942_LEGACY-NEXT: s_waitcnt lgkmcnt(0)
-; GFX942_LEGACY-NEXT: v_mov_b32_e32 v1, s0
-; GFX942_LEGACY-NEXT: global_store_dword v0, v1, s[8:9]
-; GFX942_LEGACY-NEXT: s_endpgm
+; GFX942-LABEL: random_incorrect_offset:
+; GFX942: ; %bb.1:
+; GFX942-NEXT: s_load_dwordx2 s[8:9], s[4:5], 0x0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: s_branch .LBB21_0
+; GFX942-NEXT: .p2align 8
+; GFX942-NEXT: ; %bb.2:
+; GFX942-NEXT: .LBB21_0:
+; GFX942-NEXT: s_load_dword s0, s[4:5], 0xa
+; GFX942-NEXT: v_mov_b32_e32 v0, 0
+; GFX942-NEXT: s_waitcnt lgkmcnt(0)
+; GFX942-NEXT: v_mov_b32_e32 v1, s0
+; GFX942-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX942-NEXT: s_endpgm
%imp_arg_ptr = call ptr addrspace(4) @llvm.amdgcn.implicitarg.ptr()
%gep = getelementptr i8, ptr addrspace(4) %imp_arg_ptr, i32 2
%load = load i32, ptr addrspace(4) %gep
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
index 1934ce3..e7c715f 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag-undef-poison.ll
@@ -1,6 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=1 < %s | FileCheck --check-prefixes=GFX942,GFX942_PTRADD %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel -amdgpu-use-sdag-ptradd=0 < %s | FileCheck --check-prefixes=GFX942,GFX942_LEGACY %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -start-before=amdgpu-isel < %s | FileCheck --check-prefixes=GFX942 %s
; Tests for undef and poison DAG folds for the ISD::PTRADD SelectionDAG opcode.
; If any additions are generated for these tests, the folds don't work.
@@ -44,6 +43,3 @@ define ptr @undef_base(ptr %p, i64 %offset) {
%gep1 = getelementptr i8, ptr undef, i64 %offset
ret ptr %gep1
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX942_LEGACY: {{.*}}
-; GFX942_PTRADD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
index 9dd2502..f4f5a78 100644
--- a/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
+++ b/llvm/test/CodeGen/AMDGPU/ptradd-sdag.ll
@@ -1,14 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX8,GFX8_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX942,GFX942_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX10,GFX10_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX11,GFX11_LEGACY
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=1 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_PTRADD
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -amdgpu-use-sdag-ptradd=0 < %s | FileCheck %s -check-prefixes=GFX12,GFX12_LEGACY
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji < %s | FileCheck %s -check-prefixes=GFX8
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx942 < %s | FileCheck %s -check-prefixes=GFX942
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1030 < %s | FileCheck %s -check-prefixes=GFX10
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck %s -check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 < %s | FileCheck %s -check-prefixes=GFX12
; Tests for the ISD::PTRADD SelectionDAG opcode. This only tests 64-bit address
; spaces since PTRADD is currently only used for these.
@@ -511,15 +506,3 @@ entry:
store i32 %val, ptr addrspace(1) %gep.to, align 4
ret void
}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX10_LEGACY: {{.*}}
-; GFX10_PTRADD: {{.*}}
-; GFX11_LEGACY: {{.*}}
-; GFX11_PTRADD: {{.*}}
-; GFX12_LEGACY: {{.*}}
-; GFX12_PTRADD: {{.*}}
-; GFX8_LEGACY: {{.*}}
-; GFX8_PTRADD: {{.*}}
-; GFX942_LEGACY: {{.*}}
-; GFX942_PTRADD: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
index 65a99d0..480eb0d 100644
--- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
+++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll
@@ -52,11 +52,12 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
; HAWAII-LABEL: local_store_i55:
; HAWAII: ; %bb.0:
; HAWAII-NEXT: s_add_i32 s12, s12, s17
-; HAWAII-NEXT: s_or_b32 s0, s8, 14
-; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
; HAWAII-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
+; HAWAII-NEXT: s_add_u32 s0, s8, 14
+; HAWAII-NEXT: s_addc_u32 s1, s9, 0
; HAWAII-NEXT: v_mov_b32_e32 v0, s0
-; HAWAII-NEXT: v_mov_b32_e32 v1, s9
+; HAWAII-NEXT: s_mov_b32 flat_scratch_lo, s13
+; HAWAII-NEXT: v_mov_b32_e32 v1, s1
; HAWAII-NEXT: flat_load_ubyte v0, v[0:1]
; HAWAII-NEXT: s_load_dword s2, s[8:9], 0x0
; HAWAII-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x2
@@ -74,25 +75,27 @@ define amdgpu_kernel void @local_store_i55(ptr addrspace(3) %ptr, i55 %arg) #0 {
;
; FIJI-LABEL: local_store_i55:
; FIJI: ; %bb.0:
+; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
; FIJI-NEXT: s_add_i32 s12, s12, s17
-; FIJI-NEXT: s_or_b32 s0, s8, 14
-; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
; FIJI-NEXT: s_lshr_b32 flat_scratch_hi, s12, 8
-; FIJI-NEXT: v_mov_b32_e32 v0, s0
-; FIJI-NEXT: v_mov_b32_e32 v1, s9
-; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
-; FIJI-NEXT: s_load_dwordx2 s[0:1], s[8:9], 0x8
-; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0
+; FIJI-NEXT: s_mov_b32 flat_scratch_lo, s13
; FIJI-NEXT: s_mov_b32 m0, -1
; FIJI-NEXT: s_waitcnt lgkmcnt(0)
-; FIJI-NEXT: s_and_b32 s3, s1, 0xffff
-; FIJI-NEXT: v_mov_b32_e32 v1, s2
+; FIJI-NEXT: s_and_b32 s4, s1, 0xffff
+; FIJI-NEXT: s_add_u32 s2, s8, 14
+; FIJI-NEXT: s_addc_u32 s3, s9, 0
+; FIJI-NEXT: v_mov_b32_e32 v0, s2
+; FIJI-NEXT: v_mov_b32_e32 v1, s3
+; FIJI-NEXT: flat_load_ubyte v0, v[0:1]
+; FIJI-NEXT: s_load_dword s2, s[8:9], 0x0
; FIJI-NEXT: v_mov_b32_e32 v2, s1
; FIJI-NEXT: v_mov_b32_e32 v3, s0
+; FIJI-NEXT: s_waitcnt lgkmcnt(0)
+; FIJI-NEXT: v_mov_b32_e32 v1, s2
; FIJI-NEXT: ds_write_b16 v1, v2 offset:4
; FIJI-NEXT: s_waitcnt vmcnt(0)
; FIJI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; FIJI-NEXT: v_or_b32_e32 v0, s3, v0
+; FIJI-NEXT: v_or_b32_e32 v0, s4, v0
; FIJI-NEXT: v_bfe_u32 v0, v0, 16, 7
; FIJI-NEXT: ds_write_b8 v1, v0 offset:6
; FIJI-NEXT: ds_write_b32 v1, v3
diff --git a/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll b/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll
new file mode 100644
index 0000000..267e365
--- /dev/null
+++ b/llvm/test/CodeGen/Hexagon/unaligned-vec-store.ll
@@ -0,0 +1,23 @@
+; RUN: llc -march=hexagon -mcpu=hexagonv68 -mattr=+hvxv68,+hvx-length128B < %s | FileCheck %s
+; REQUIRES: asserts
+
+; Check that llc does not assert when the unaligned vector store V6_vS32Ub_npred_ai is generated.
+; CHECK: if (!p{{[0-3]}}) vmemu
+
+target triple = "hexagon-unknown-unknown-elf"
+
+define fastcc void @test(i1 %cmp.i.i) {
+entry:
+ %call.i.i.i172 = load ptr, ptr null, align 4
+ %add.ptr = getelementptr i8, ptr %call.i.i.i172, i32 1
+ store <32 x i32> zeroinitializer, ptr %add.ptr, align 128
+ %add.ptr4.i4 = getelementptr i8, ptr %call.i.i.i172, i32 129
+ br i1 %cmp.i.i, label %common.ret, label %if.end.i.i
+
+common.ret: ; preds = %if.end.i.i, %entry
+ ret void
+
+if.end.i.i: ; preds = %entry
+ store <32 x i32> zeroinitializer, ptr %add.ptr4.i4, align 1
+ br label %common.ret
+}
diff --git a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
index 18fb879..21ca041 100644
--- a/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
+++ b/llvm/test/CodeGen/NVPTX/f32x2-convert-i32x2.ll
@@ -115,5 +115,150 @@ define ptx_kernel void @inlineasm(ptr %p) {
store <2 x float> %mul, ptr %p, align 8
ret void
}
+
+define ptx_kernel void @trunc_v2i32(<2 x i32> %0) {
+; CHECK-SM90A-LABEL: trunc_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b32 %r<7>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<2>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b32 {%r1, %r2}, [trunc_v2i32_param_0];
+; CHECK-SM90A-NEXT: prmt.b32 %r3, %r1, %r2, 0x3340U;
+; CHECK-SM90A-NEXT: mov.b32 %r4, 0;
+; CHECK-SM90A-NEXT: prmt.b32 %r5, %r4, 0, 0x3340U;
+; CHECK-SM90A-NEXT: prmt.b32 %r6, %r5, %r3, 0x5410U;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r6;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: trunc_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b32 %r<7>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<3>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.b64 %rd1, [trunc_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b64 {%r1, %r2}, %rd1;
+; CHECK-SM100-NEXT: mov.b32 %r3, 0;
+; CHECK-SM100-NEXT: prmt.b32 %r4, %r3, 0, 0x3340U;
+; CHECK-SM100-NEXT: prmt.b32 %r5, %r1, %r2, 0x3340U;
+; CHECK-SM100-NEXT: prmt.b32 %r6, %r4, %r5, 0x5410U;
+; CHECK-SM100-NEXT: mov.b64 %rd2, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd2], %r6;
+; CHECK-SM100-NEXT: ret;
+ %2 = trunc <2 x i32> %0 to <2 x i8>
+ %3 = shufflevector <2 x i8> zeroinitializer, <2 x i8> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i8> %3, ptr null, align 4
+ ret void
+}
+
+define ptx_kernel void @zextend_to_v2i32(<2 x i8> %0) {
+; CHECK-SM90A-LABEL: zextend_to_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM90A-NEXT: .reg .b32 %r<4>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r3, %rs2;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r3;
+; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
+; CHECK-SM90A-NEXT: st.b32 [%rd2], %r2;
+; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
+; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: zextend_to_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM100-NEXT: .reg .b32 %r<5>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [zextend_to_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r3, %rs1;
+; CHECK-SM100-NEXT: mov.b64 %rd1, {%r3, %r2};
+; CHECK-SM100-NEXT: mov.b32 %r4, 0;
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r4, %r4};
+; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
+; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
+; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
+; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
+; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
+; CHECK-SM100-NEXT: ret;
+ %2 = zext <2 x i8> %0 to <2 x i32>
+ %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %3, ptr null, align 4
+ ret void
+}
+
+define ptx_kernel void @sextend_to_v2i32(<2 x i8> %0) {
+; CHECK-SM90A-LABEL: sextend_to_v2i32(
+; CHECK-SM90A: {
+; CHECK-SM90A-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM90A-NEXT: .reg .b32 %r<6>;
+; CHECK-SM90A-NEXT: .reg .b64 %rd<5>;
+; CHECK-SM90A-EMPTY:
+; CHECK-SM90A-NEXT: // %bb.0:
+; CHECK-SM90A-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
+; CHECK-SM90A-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r2, %rs1;
+; CHECK-SM90A-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-SM90A-NEXT: cvt.u32.u16 %r4, %rs2;
+; CHECK-SM90A-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-SM90A-NEXT: mov.b64 %rd1, 12;
+; CHECK-SM90A-NEXT: st.b32 [%rd1], %r5;
+; CHECK-SM90A-NEXT: mov.b64 %rd2, 8;
+; CHECK-SM90A-NEXT: st.b32 [%rd2], %r3;
+; CHECK-SM90A-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM90A-NEXT: st.b32 [%rd3], 0;
+; CHECK-SM90A-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM90A-NEXT: st.b32 [%rd4], 0;
+; CHECK-SM90A-NEXT: ret;
+;
+; CHECK-SM100-LABEL: sextend_to_v2i32(
+; CHECK-SM100: {
+; CHECK-SM100-NEXT: .reg .b16 %rs<3>;
+; CHECK-SM100-NEXT: .reg .b32 %r<7>;
+; CHECK-SM100-NEXT: .reg .b64 %rd<8>;
+; CHECK-SM100-EMPTY:
+; CHECK-SM100-NEXT: // %bb.0:
+; CHECK-SM100-NEXT: ld.param.v2.b8 {%rs1, %rs2}, [sextend_to_v2i32_param_0];
+; CHECK-SM100-NEXT: mov.b32 %r1, {%rs1, %rs2};
+; CHECK-SM100-NEXT: cvt.u32.u16 %r2, %rs2;
+; CHECK-SM100-NEXT: cvt.s32.s8 %r3, %r2;
+; CHECK-SM100-NEXT: cvt.u32.u16 %r4, %rs1;
+; CHECK-SM100-NEXT: cvt.s32.s8 %r5, %r4;
+; CHECK-SM100-NEXT: mov.b64 %rd1, {%r5, %r3};
+; CHECK-SM100-NEXT: mov.b32 %r6, 0;
+; CHECK-SM100-NEXT: mov.b64 %rd2, {%r6, %r6};
+; CHECK-SM100-NEXT: mov.b64 %rd3, 4;
+; CHECK-SM100-NEXT: st.b32 [%rd3], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd4, 0;
+; CHECK-SM100-NEXT: st.b32 [%rd4], %rd2;
+; CHECK-SM100-NEXT: mov.b64 %rd5, 8;
+; CHECK-SM100-NEXT: st.b32 [%rd5], %rd1;
+; CHECK-SM100-NEXT: shr.u64 %rd6, %rd1, 32;
+; CHECK-SM100-NEXT: mov.b64 %rd7, 12;
+; CHECK-SM100-NEXT: st.b32 [%rd7], %rd6;
+; CHECK-SM100-NEXT: ret;
+ %2 = sext <2 x i8> %0 to <2 x i32>
+ %3 = shufflevector <2 x i32> zeroinitializer, <2 x i32> %2, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ store <4 x i32> %3, ptr null, align 4
+ ret void
+}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
index 7204064..f1d17f9f 100644
--- a/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
+++ b/llvm/test/CodeGen/RISCV/GlobalISel/legalizer-info-validation.mir
@@ -505,6 +505,9 @@
# DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
+# DEBUG-NEXT: G_FMODF (opcode {{[0-9]+}}): 1 type index, 0 imm indices
+# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
+# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_FPOW (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
@@ -607,11 +610,11 @@
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: G_FMINIMUMNUM (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode 219 is aliased to 183
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FMAXIMUMNUM (opcode {{[0-9]+}}): 1 type index, 0 imm indices
-# DEBUG-NEXT: .. opcode 220 is aliased to 183
+# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
# DEBUG-NEXT: .. the first uncovered type index: 1, OK
# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_GET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices
diff --git a/llvm/test/CodeGen/RISCV/rvv/remat.ll b/llvm/test/CodeGen/RISCV/rvv/remat.ll
index 06d54fa..95bff27 100644
--- a/llvm/test/CodeGen/RISCV/rvv/remat.ll
+++ b/llvm/test/CodeGen/RISCV/rvv/remat.ll
@@ -301,3 +301,135 @@ define void @vfmv.s.f(ptr %p, double %x) {
store volatile double %x, ptr %p
ret void
}
+
+; This test is fairly fragile, but it's trying to cover the case which
+; caused the revert of bba9172 due to interaction with how rematerialized
+; instructions are pruned from the original live interval. In the result
+; below, we remat the vmv.v.x into the loop, but fail to remat the vmv.v.x
+; a second time after further splitting its live range. We shouldn't need
+; to spill it to the stack at all.
+define i64 @dual_remat(i64 %0, <vscale x 16 x i64> %1, <vscale x 16 x i64> %2, ptr %p) #0 {
+; CHECK-LABEL: dual_remat:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: addi sp, sp, -16
+; CHECK-NEXT: .cfi_def_cfa_offset 16
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: sub sp, sp, a1
+; CHECK-NEXT: .cfi_escape 0x0f, 0x0d, 0x72, 0x00, 0x11, 0x10, 0x22, 0x11, 0x21, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 16 + 33 * vlenb
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a1, a1, 3
+; CHECK-NEXT: add a1, sp, a1
+; CHECK-NEXT: addi a1, a1, 16
+; CHECK-NEXT: vs8r.v v16, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: addi a1, sp, 16
+; CHECK-NEXT: vs8r.v v8, (a1) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vsetvli a1, zero, e64, m8, ta, ma
+; CHECK-NEXT: vmv.v.i v16, 0
+; CHECK-NEXT: csrr a2, vlenb
+; CHECK-NEXT: srli a1, a2, 3
+; CHECK-NEXT: slli a2, a2, 3
+; CHECK-NEXT: add a2, a3, a2
+; CHECK-NEXT: vmv.v.i v0, 0
+; CHECK-NEXT: .LBB8_1: # %vector.body
+; CHECK-NEXT: # =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v16, (a4) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: vmv.v.x v8, a0
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a5, a4, 4
+; CHECK-NEXT: add a4, a5, a4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs8r.v v8, (a4) # vscale x 64-byte Folded Spill
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vand.vv v16, v16, v8
+; CHECK-NEXT: vmsne.vi v24, v16, 0
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vs1r.v v24, (a4) # vscale x 8-byte Folded Spill
+; CHECK-NEXT: vand.vv v16, v0, v8
+; CHECK-NEXT: vmsne.vi v8, v16, 0
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: mv a5, a4
+; CHECK-NEXT: slli a4, a4, 3
+; CHECK-NEXT: add a5, a5, a4
+; CHECK-NEXT: slli a4, a4, 1
+; CHECK-NEXT: add a4, a4, a5
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl8r.v v16, (a4) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: csrr a4, vlenb
+; CHECK-NEXT: slli a4, a4, 4
+; CHECK-NEXT: add a4, sp, a4
+; CHECK-NEXT: addi a4, a4, 16
+; CHECK-NEXT: vl1r.v v9, (a4) # vscale x 8-byte Folded Reload
+; CHECK-NEXT: vsetvli a4, zero, e8, mf4, ta, ma
+; CHECK-NEXT: vslideup.vx v9, v8, a1
+; CHECK-NEXT: vsetvli a4, zero, e8, m2, ta, ma
+; CHECK-NEXT: vcpop.m a4, v9
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a6, a5, 4
+; CHECK-NEXT: add a5, a6, a5
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vs8r.v v8, (a3)
+; CHECK-NEXT: vs8r.v v8, (a2)
+; CHECK-NEXT: addi a5, sp, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vsetvli a5, zero, e64, m8, ta, ma
+; CHECK-NEXT: vor.vv v16, v16, v8
+; CHECK-NEXT: csrr a5, vlenb
+; CHECK-NEXT: slli a5, a5, 3
+; CHECK-NEXT: add a5, sp, a5
+; CHECK-NEXT: addi a5, a5, 16
+; CHECK-NEXT: vl8r.v v8, (a5) # vscale x 64-byte Folded Reload
+; CHECK-NEXT: vor.vv v0, v0, v8
+; CHECK-NEXT: beqz a4, .LBB8_1
+; CHECK-NEXT: # %bb.2: # %middle.block
+; CHECK-NEXT: andi a0, a0, 1
+; CHECK-NEXT: csrr a1, vlenb
+; CHECK-NEXT: slli a2, a1, 5
+; CHECK-NEXT: add a1, a2, a1
+; CHECK-NEXT: add sp, sp, a1
+; CHECK-NEXT: .cfi_def_cfa sp, 16
+; CHECK-NEXT: addi sp, sp, 16
+; CHECK-NEXT: .cfi_def_cfa_offset 0
+; CHECK-NEXT: ret
+entry:
+ %broadcast.splatinsert = insertelement <vscale x 16 x i64> zeroinitializer, i64 %0, i64 0
+ %broadcast.splat = shufflevector <vscale x 16 x i64> %broadcast.splatinsert, <vscale x 16 x i64> zeroinitializer, <vscale x 16 x i32> zeroinitializer
+ br label %vector.body
+
+vector.body: ; preds = %vector.body, %entry
+ %vec.ind = phi <vscale x 16 x i64> [ zeroinitializer, %entry ], [ %vec.ind.next, %vector.body ]
+ %3 = and <vscale x 16 x i64> %vec.ind, %broadcast.splat
+ %4 = icmp ne <vscale x 16 x i64> %3, zeroinitializer
+ store <vscale x 16 x i64> %broadcast.splat, ptr %p
+ %5 = tail call i1 @llvm.vector.reduce.or.nxv16i1(<vscale x 16 x i1> %4)
+ %vec.ind.next = or <vscale x 16 x i64> %vec.ind, %1
+ br i1 %5, label %middle.block, label %vector.body
+
+middle.block: ; preds = %vector.body
+ %and.i = and i64 1, %0
+ ret i64 %and.i
+}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll
index cd52498..2964da9 100644
--- a/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/ImplicitBinding.ll
@@ -32,6 +32,7 @@
; CHECK-DAG: OpDecorate [[g]] Binding 0
; CHECK-DAG: OpDecorate [[h]] DescriptorSet 10
; CHECK-DAG: OpDecorate [[h]] Binding 3
+; CHECK-NOT: OpDecorate [[h]] Binding 4
; CHECK-DAG: OpDecorate [[i]] DescriptorSet 10
; CHECK-DAG: OpDecorate [[i]] Binding 2
@@ -44,30 +45,34 @@ entry:
%3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 2, i32 1, i32 0, ptr nonnull @.str.6)
%4 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 1, i32 1, i32 0, ptr nonnull @.str.8)
%5 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 2, i32 10, i32 1, i32 0, ptr nonnull @.str.10)
- %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 1, i32 0, ptr nonnull @.str.12)
- %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14)
- %8 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0)
- %9 = load i32, ptr addrspace(11) %8, align 4
- %10 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0)
- %11 = load i32, ptr addrspace(11) %10, align 4
- %add.i = add nsw i32 %11, %9
- %12 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
- %13 = load i32, ptr addrspace(11) %12, align 4
- %add4.i = add nsw i32 %add.i, %13
- %14 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0)
- %15 = load i32, ptr addrspace(11) %14, align 4
- %add6.i = add nsw i32 %add4.i, %15
- %16 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0)
- %17 = load i32, ptr addrspace(11) %16, align 4
- %add8.i = add nsw i32 %add6.i, %17
- %18 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0)
- %19 = load i32, ptr addrspace(11) %18, align 4
- %add10.i = add nsw i32 %add8.i, %19
- %20 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0)
- %21 = load i32, ptr addrspace(11) %20, align 4
- %add12.i = add nsw i32 %add10.i, %21
- %22 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
- store i32 %add12.i, ptr addrspace(11) %22, align 4
+ %6 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 0, ptr nonnull @.str.12)
+ %7 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 3, i32 10, i32 2, i32 1, ptr nonnull @.str.12)
+ %8 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefrombinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 10, i32 2, i32 1, i32 0, ptr nonnull @.str.14)
+ %9 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %1, i32 0)
+ %10 = load i32, ptr addrspace(11) %9, align 4
+ %11 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %2, i32 0)
+ %12 = load i32, ptr addrspace(11) %11, align 4
+ %add.i = add nsw i32 %12, %10
+ %13 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
+ %14 = load i32, ptr addrspace(11) %13, align 4
+ %add4.i = add nsw i32 %add.i, %14
+ %15 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %4, i32 0)
+ %16 = load i32, ptr addrspace(11) %15, align 4
+ %add6.i = add nsw i32 %add4.i, %16
+ %17 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %5, i32 0)
+ %18 = load i32, ptr addrspace(11) %17, align 4
+ %add8.i = add nsw i32 %add6.i, %18
+ %19 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %6, i32 0)
+ %20 = load i32, ptr addrspace(11) %19, align 4
+ %add10.i = add nsw i32 %add8.i, %20
+ %21 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %7, i32 0)
+ %22 = load i32, ptr addrspace(11) %21, align 4
+ %add12.i = add nsw i32 %add10.i, %22
+ %23 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %8, i32 0)
+ %24 = load i32, ptr addrspace(11) %23, align 4
+ %add14.i = add nsw i32 %add12.i, %24
+ %25 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
+ store i32 %add14.i, ptr addrspace(11) %25, align 4
ret void
}
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll
new file mode 100644
index 0000000..c968c99
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-resources/UniqueImplicitBindingNumber.ll
@@ -0,0 +1,19 @@
+; RUN: not llc -O0 -mtriple=spirv32-unknown-unknown %s -o %t.spvt 2>&1 | FileCheck %s --check-prefix=CHECK-ERROR
+; CHECK-ERROR: LLVM ERROR: Implicit binding calls with the same order ID must have the same descriptor set
+
+@.str = private unnamed_addr constant [2 x i8] c"b\00", align 1
+@.str.2 = private unnamed_addr constant [2 x i8] c"c\00", align 1
+
+define void @main() local_unnamed_addr #0 {
+entry:
+ %0 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 0, i32 1, i32 0, ptr nonnull @.str)
+ %1 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %0, i32 0)
+ %2 = load i32, ptr addrspace(11) %1, align 4
+ %3 = tail call target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) @llvm.spv.resource.handlefromimplicitbinding.tspirv.SignedImage_i32_5_2_0_0_2_0t(i32 0, i32 1, i32 1, i32 0, ptr nonnull @.str.2)
+ %4 = tail call noundef align 4 dereferenceable(4) ptr addrspace(11) @llvm.spv.resource.getpointer.p11.tspirv.SignedImage_i32_5_2_0_0_2_0t(target("spirv.SignedImage", i32, 5, 2, 0, 0, 2, 0) %3, i32 0)
+ store i32 %2, ptr addrspace(11) %4, align 4
+ ret void
+}
+
+
+attributes #0 = { "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" }
diff --git a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
index d3d6413..eb7c1b6 100644
--- a/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
+++ b/llvm/test/CodeGen/SystemZ/fp-cmp-04.ll
@@ -235,7 +235,7 @@ define half @f12_half(half %dummy, half %val, ptr %dest) {
; CHECK-NEXT: blah %f0
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
-; CHECK-NEXT: ltebr %f0, %f0
+; CHECK-NEXT: ltebr %f1, %f0
; CHECK-NEXT: jl .LBB11_2
; CHECK-NEXT:# %bb.1:
; CHECK-NEXT: lgdr %r0, %f8
@@ -344,7 +344,7 @@ define half @f15_half(half %val, half %dummy, ptr %dest) {
; CHECK-NEXT: blah %f2
; CHECK-NEXT: #NO_APP
; CHECK-NEXT: brasl %r14, __extendhfsf2@PLT
-; CHECK-NEXT: ltebr %f0, %f0
+; CHECK-NEXT: ltebr %f1, %f0
; CHECK-NEXT: jl .LBB15_2
; CHECK-NEXT:# %bb.1:
; CHECK-NEXT: lgdr %r0, %f8
diff --git a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
index 3bc0aba..93e2889 100644
--- a/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
+++ b/llvm/test/CodeGen/VE/Vector/vec_divrem.ll
@@ -7,19 +7,22 @@
define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: udiv_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s0, %s0, %s4
-; CHECK-NEXT: srl %s0, %s0, 32
+; CHECK-NEXT: and %s4, %s0, (56)0
; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: muls.l %s1, %s1, %s4
-; CHECK-NEXT: srl %s1, %s1, 32
; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: muls.l %s2, %s2, %s4
-; CHECK-NEXT: srl %s2, %s2, 32
; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: muls.l %s3, %s3, %s4
-; CHECK-NEXT: srl %s3, %s3, 32
+; CHECK-NEXT: or %s0, 0, (0)1
+; CHECK-NEXT: cmpu.w %s5, %s3, (56)0
+; CHECK-NEXT: or %s3, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s3, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s2, (56)0
+; CHECK-NEXT: or %s2, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s2, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s5, %s1, (56)0
+; CHECK-NEXT: or %s1, 0, (0)1
+; CHECK-NEXT: cmov.w.eq %s1, (63)0, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (63)0, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = udiv <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
@@ -28,27 +31,18 @@ define <4 x i8> @udiv_by_minus_one(<4 x i8> %x) {
define <4 x i8> @urem_by_minus_one(<4 x i8> %x) {
; CHECK-LABEL: urem_by_minus_one:
; CHECK: # %bb.0:
-; CHECK-NEXT: and %s0, %s0, (56)0
-; CHECK-NEXT: and %s1, %s1, (56)0
-; CHECK-NEXT: and %s2, %s2, (56)0
-; CHECK-NEXT: and %s3, %s3, (56)0
-; CHECK-NEXT: lea %s4, 16843010
-; CHECK-NEXT: muls.l %s5, %s3, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s3, %s3, %s5
-; CHECK-NEXT: muls.l %s5, %s2, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s2, %s2, %s5
-; CHECK-NEXT: muls.l %s5, %s1, %s4
-; CHECK-NEXT: srl %s5, %s5, 32
-; CHECK-NEXT: muls.w.sx %s5, %s5, (56)0
-; CHECK-NEXT: subs.w.sx %s1, %s1, %s5
-; CHECK-NEXT: muls.l %s4, %s0, %s4
-; CHECK-NEXT: srl %s4, %s4, 32
-; CHECK-NEXT: muls.w.sx %s4, %s4, (56)0
-; CHECK-NEXT: subs.w.sx %s0, %s0, %s4
+; CHECK-NEXT: and %s4, %s0, (56)0
+; CHECK-NEXT: and %s5, %s1, (56)0
+; CHECK-NEXT: and %s6, %s2, (56)0
+; CHECK-NEXT: and %s7, %s3, (56)0
+; CHECK-NEXT: cmpu.w %s7, %s7, (56)0
+; CHECK-NEXT: cmov.w.eq %s3, (0)1, %s7
+; CHECK-NEXT: cmpu.w %s6, %s6, (56)0
+; CHECK-NEXT: cmov.w.eq %s2, (0)1, %s6
+; CHECK-NEXT: cmpu.w %s5, %s5, (56)0
+; CHECK-NEXT: cmov.w.eq %s1, (0)1, %s5
+; CHECK-NEXT: cmpu.w %s4, %s4, (56)0
+; CHECK-NEXT: cmov.w.eq %s0, (0)1, %s4
; CHECK-NEXT: b.l.t (, %s10)
%r = urem <4 x i8> %x, <i8 255, i8 255, i8 255, i8 255>
ret <4 x i8> %r
diff --git a/llvm/test/CodeGen/X86/fshl.ll b/llvm/test/CodeGen/X86/fshl.ll
index ec1b8a3..f998128 100644
--- a/llvm/test/CodeGen/X86/fshl.ll
+++ b/llvm/test/CodeGen/X86/fshl.ll
@@ -335,84 +335,83 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SLOW-NEXT: pushl %esi
; X86-SLOW-NEXT: andl $-16, %esp
; X86-SLOW-NEXT: subl $32, %esp
-; X86-SLOW-NEXT: movl 24(%ebp), %esi
+; X86-SLOW-NEXT: movl 24(%ebp), %edi
; X86-SLOW-NEXT: movl 28(%ebp), %eax
; X86-SLOW-NEXT: movl 48(%ebp), %edx
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: testb $64, %cl
-; X86-SLOW-NEXT: movl 52(%ebp), %edi
+; X86-SLOW-NEXT: movl 52(%ebp), %ebx
; X86-SLOW-NEXT: jne .LBB6_1
; X86-SLOW-NEXT: # %bb.2:
; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %esi, %edx
-; X86-SLOW-NEXT: movl 32(%ebp), %esi
-; X86-SLOW-NEXT: movl %edi, %ecx
-; X86-SLOW-NEXT: movl %eax, %edi
+; X86-SLOW-NEXT: movl %edi, %edx
+; X86-SLOW-NEXT: movl 32(%ebp), %edi
+; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %eax, %ebx
; X86-SLOW-NEXT: movl 36(%ebp), %eax
; X86-SLOW-NEXT: jmp .LBB6_3
; X86-SLOW-NEXT: .LBB6_1:
; X86-SLOW-NEXT: movl 40(%ebp), %ecx
; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl 44(%ebp), %ecx
+; X86-SLOW-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: .LBB6_3:
-; X86-SLOW-NEXT: movl 56(%ebp), %ebx
-; X86-SLOW-NEXT: testb $32, %bl
+; X86-SLOW-NEXT: movl 56(%ebp), %ecx
+; X86-SLOW-NEXT: testb $32, %cl
; X86-SLOW-NEXT: jne .LBB6_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %ecx, %ebx
; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edx, %edi
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: jmp .LBB6_6
; X86-SLOW-NEXT: .LBB6_4:
-; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %ecx, %edx
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edx, %ebx
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-SLOW-NEXT: .LBB6_6:
-; X86-SLOW-NEXT: movl %edx, %esi
+; X86-SLOW-NEXT: movl %edi, %eax
+; X86-SLOW-NEXT: shll %cl, %eax
+; X86-SLOW-NEXT: shrl %esi
+; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: notb %dl
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shrl %cl, %esi
+; X86-SLOW-NEXT: orl %eax, %esi
+; X86-SLOW-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %ebx, %eax
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: movl %ebx, %edi
+; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: shrl %edi
-; X86-SLOW-NEXT: movl %ecx, %ebx
-; X86-SLOW-NEXT: notb %bl
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: orl %esi, %edi
+; X86-SLOW-NEXT: orl %eax, %edi
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: shrl %edx
-; X86-SLOW-NEXT: movl %ebx, %ecx
-; X86-SLOW-NEXT: shrl %cl, %edx
-; X86-SLOW-NEXT: orl %eax, %edx
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-SLOW-NEXT: movl %ebx, %eax
+; X86-SLOW-NEXT: shrl %ebx
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shrl %cl, %ebx
+; X86-SLOW-NEXT: orl %eax, %ebx
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-SLOW-NEXT: shll %cl, %eax
; X86-SLOW-NEXT: shrl %esi
-; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shrl %cl, %esi
; X86-SLOW-NEXT: orl %eax, %esi
-; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SLOW-NEXT: shll %cl, %eax
-; X86-SLOW-NEXT: shrl %ebx
-; X86-SLOW-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-SLOW-NEXT: shrl %cl, %ebx
-; X86-SLOW-NEXT: orl %eax, %ebx
; X86-SLOW-NEXT: movl 8(%ebp), %eax
-; X86-SLOW-NEXT: movl %ebx, 12(%eax)
-; X86-SLOW-NEXT: movl %esi, 8(%eax)
-; X86-SLOW-NEXT: movl %edx, 4(%eax)
-; X86-SLOW-NEXT: movl %edi, (%eax)
+; X86-SLOW-NEXT: movl %esi, 12(%eax)
+; X86-SLOW-NEXT: movl %ebx, 8(%eax)
+; X86-SLOW-NEXT: movl %edi, 4(%eax)
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-SLOW-NEXT: movl %ecx, (%eax)
; X86-SLOW-NEXT: leal -12(%ebp), %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/fshr.ll b/llvm/test/CodeGen/X86/fshr.ll
index 544ab7f..c307833 100644
--- a/llvm/test/CodeGen/X86/fshr.ll
+++ b/llvm/test/CodeGen/X86/fshr.ll
@@ -322,79 +322,79 @@ define i128 @var_shift_i128(i128 %x, i128 %y, i128 %z) nounwind {
; X86-SLOW-NEXT: subl $16, %esp
; X86-SLOW-NEXT: movl 24(%ebp), %edx
; X86-SLOW-NEXT: movl 28(%ebp), %esi
-; X86-SLOW-NEXT: movl 48(%ebp), %ebx
+; X86-SLOW-NEXT: movl 48(%ebp), %edi
; X86-SLOW-NEXT: movl 56(%ebp), %eax
; X86-SLOW-NEXT: testb $64, %al
-; X86-SLOW-NEXT: movl 52(%ebp), %edi
+; X86-SLOW-NEXT: movl 52(%ebp), %eax
; X86-SLOW-NEXT: je .LBB6_1
; X86-SLOW-NEXT: # %bb.2:
-; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl %edx, %ebx
+; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl %edx, %edi
; X86-SLOW-NEXT: movl 32(%ebp), %edx
-; X86-SLOW-NEXT: movl %edi, %eax
-; X86-SLOW-NEXT: movl %esi, %edi
+; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: movl %esi, %eax
; X86-SLOW-NEXT: movl 36(%ebp), %esi
; X86-SLOW-NEXT: jmp .LBB6_3
; X86-SLOW-NEXT: .LBB6_1:
-; X86-SLOW-NEXT: movl 40(%ebp), %eax
-; X86-SLOW-NEXT: movl %eax, (%esp) # 4-byte Spill
-; X86-SLOW-NEXT: movl 44(%ebp), %eax
+; X86-SLOW-NEXT: movl 40(%ebp), %ecx
+; X86-SLOW-NEXT: movl %ecx, (%esp) # 4-byte Spill
+; X86-SLOW-NEXT: movl 44(%ebp), %ecx
; X86-SLOW-NEXT: .LBB6_3:
-; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: testb $32, %cl
+; X86-SLOW-NEXT: movl 56(%ebp), %ebx
+; X86-SLOW-NEXT: testb $32, %bl
; X86-SLOW-NEXT: je .LBB6_4
; X86-SLOW-NEXT: # %bb.5:
-; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl %ecx, %ebx
; X86-SLOW-NEXT: jmp .LBB6_6
; X86-SLOW-NEXT: .LBB6_4:
; X86-SLOW-NEXT: movl %edx, %esi
+; X86-SLOW-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SLOW-NEXT: movl %eax, %ebx
-; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
+; X86-SLOW-NEXT: movl %ecx, %edi
+; X86-SLOW-NEXT: movl (%esp), %ebx # 4-byte Reload
; X86-SLOW-NEXT: .LBB6_6:
-; X86-SLOW-NEXT: shrl %cl, %eax
-; X86-SLOW-NEXT: movl %eax, %edx
-; X86-SLOW-NEXT: movl %ecx, %eax
-; X86-SLOW-NEXT: notb %al
-; X86-SLOW-NEXT: movl %ebx, %edi
-; X86-SLOW-NEXT: addl %ebx, %ebx
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %ebx
-; X86-SLOW-NEXT: orl %edx, %ebx
-; X86-SLOW-NEXT: movl %ebx, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
-; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: shrl %cl, %edi
-; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-SLOW-NEXT: leal (%ebx,%ebx), %edx
-; X86-SLOW-NEXT: movl %eax, %ecx
-; X86-SLOW-NEXT: shll %cl, %edx
-; X86-SLOW-NEXT: orl %edi, %edx
+; X86-SLOW-NEXT: shrl %cl, %ebx
+; X86-SLOW-NEXT: movl %ecx, %edx
+; X86-SLOW-NEXT: notb %dl
+; X86-SLOW-NEXT: movl %edi, %eax
+; X86-SLOW-NEXT: addl %edi, %edi
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: orl %ebx, %edi
+; X86-SLOW-NEXT: movl %edi, (%esp) # 4-byte Spill
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-SLOW-NEXT: shrl %cl, %ebx
-; X86-SLOW-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-SLOW-NEXT: leal (%edi,%edi), %ebx
-; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %ebx
-; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-SLOW-NEXT: orl %eax, %ebx
; X86-SLOW-NEXT: movl 56(%ebp), %ecx
; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
; X86-SLOW-NEXT: shrl %cl, %edi
+; X86-SLOW-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SLOW-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SLOW-NEXT: leal (%eax,%eax), %edi
+; X86-SLOW-NEXT: movl %edx, %ecx
+; X86-SLOW-NEXT: shll %cl, %edi
+; X86-SLOW-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-SLOW-NEXT: movl 56(%ebp), %ecx
+; X86-SLOW-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-SLOW-NEXT: shrl %cl, %eax
; X86-SLOW-NEXT: addl %esi, %esi
-; X86-SLOW-NEXT: movl %eax, %ecx
+; X86-SLOW-NEXT: movl %edx, %ecx
; X86-SLOW-NEXT: shll %cl, %esi
-; X86-SLOW-NEXT: orl %edi, %esi
-; X86-SLOW-NEXT: movl 8(%ebp), %ecx
-; X86-SLOW-NEXT: movl %esi, 12(%ecx)
-; X86-SLOW-NEXT: movl %ebx, 8(%ecx)
-; X86-SLOW-NEXT: movl %edx, 4(%ecx)
-; X86-SLOW-NEXT: movl (%esp), %eax # 4-byte Reload
-; X86-SLOW-NEXT: movl %eax, (%ecx)
-; X86-SLOW-NEXT: movl %ecx, %eax
+; X86-SLOW-NEXT: orl %eax, %esi
+; X86-SLOW-NEXT: movl 8(%ebp), %eax
+; X86-SLOW-NEXT: movl %esi, 12(%eax)
+; X86-SLOW-NEXT: movl %edi, 8(%eax)
+; X86-SLOW-NEXT: movl %ebx, 4(%eax)
+; X86-SLOW-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-SLOW-NEXT: movl %ecx, (%eax)
; X86-SLOW-NEXT: leal -12(%ebp), %esp
; X86-SLOW-NEXT: popl %esi
; X86-SLOW-NEXT: popl %edi
diff --git a/llvm/test/CodeGen/X86/sbb.ll b/llvm/test/CodeGen/X86/sbb.ll
index 78d609d..f5a3468 100644
--- a/llvm/test/CodeGen/X86/sbb.ll
+++ b/llvm/test/CodeGen/X86/sbb.ll
@@ -365,3 +365,32 @@ define i32 @uge_sext_add(i32 %0, i32 %1, i32 %2) {
%6 = add nsw i32 %5, %0
ret i32 %6
}
+
+define i32 @sub_sub_ugt(i32 %a, i32 %b) {
+; CHECK-LABEL: sub_sub_ugt:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: sbbl %esi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ugt i32 %a, %b
+ %conv = zext i1 %cmp to i32
+ %sub = sub i32 %a, %b
+ %res = sub i32 %sub, %conv
+ ret i32 %res
+}
+
+define i32 @sub_sub_ult(i32 %a, i32 %b) {
+; CHECK-LABEL: sub_sub_ult:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: cmpl %edi, %esi
+; CHECK-NEXT: sbbl %esi, %eax
+; CHECK-NEXT: retq
+ %cmp = icmp ult i32 %b, %a
+ %conv = zext i1 %cmp to i32
+ %sub = sub i32 %a, %b
+ %res = sub i32 %sub, %conv
+ ret i32 %res
+}
+
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 7462c77..049ee47 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -613,8 +613,7 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
; i686-NEXT: shldl %cl, %esi, %ebx
; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; i686-NEXT: movl %edi, %esi
-; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT: movl %eax, %ecx
+; i686-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; i686-NEXT: shll %cl, %esi
; i686-NEXT: shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; i686-NEXT: negl %edx