aboutsummaryrefslogtreecommitdiff
path: root/llvm/test
diff options
context:
space:
mode:
Diffstat (limited to 'llvm/test')
-rw-r--r--llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll812
-rw-r--r--llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll12
-rw-r--r--llvm/test/CodeGen/AArch64/machine-combiner-copy.ll2
-rw-r--r--llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll47
-rw-r--r--llvm/test/CodeGen/AArch64/peephole-and-tst.ll4
-rw-r--r--llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll6
-rw-r--r--llvm/test/CodeGen/AArch64/tbl-loops.ll8
-rw-r--r--llvm/test/CodeGen/AArch64/trampoline.ll6
-rw-r--r--llvm/test/CodeGen/ARM/combine-movc-sub.ll12
-rw-r--r--llvm/test/CodeGen/ARM/extract-bits.ll148
-rw-r--r--llvm/test/CodeGen/ARM/extract-lowbits.ll92
-rw-r--r--llvm/test/CodeGen/ARM/llround-conv.ll74
-rw-r--r--llvm/test/CodeGen/ARM/lround-conv.ll46
-rw-r--r--llvm/test/CodeGen/RISCV/pr69586.ll204
-rw-r--r--llvm/test/CodeGen/RISCV/rv64-trampoline.ll7
-rw-r--r--llvm/test/CodeGen/SystemZ/llvm.sincos.ll4
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll6
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll151
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll80
-rw-r--r--llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll13
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-float16regloops.ll82
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-float32regloops.ll100
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-gather-increment.ll278
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-phireg.ll30
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll519
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll22
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll10
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll16
-rw-r--r--llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll92
-rw-r--r--llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll82
-rw-r--r--llvm/test/CodeGen/X86/dag-update-nodetomatch.ll5
-rw-r--r--llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir4
-rw-r--r--llvm/test/CodeGen/X86/inalloca-invoke.ll2
-rw-r--r--llvm/test/CodeGen/X86/licm-regpressure.ll62
34 files changed, 1549 insertions, 1489 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index ed68723..41f7ab8 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -1219,14 +1219,14 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
;
; GISEL-LABEL: test_shl_i1024:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: sub sp, sp, #416
-; GISEL-NEXT: stp x28, x27, [sp, #320] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x26, x25, [sp, #336] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x24, x23, [sp, #352] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x22, x21, [sp, #368] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x20, x19, [sp, #384] ; 16-byte Folded Spill
-; GISEL-NEXT: stp x29, x30, [sp, #400] ; 16-byte Folded Spill
-; GISEL-NEXT: .cfi_def_cfa_offset 416
+; GISEL-NEXT: sub sp, sp, #432
+; GISEL-NEXT: stp x28, x27, [sp, #336] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x26, x25, [sp, #352] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x24, x23, [sp, #368] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x22, x21, [sp, #384] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x20, x19, [sp, #400] ; 16-byte Folded Spill
+; GISEL-NEXT: stp x29, x30, [sp, #416] ; 16-byte Folded Spill
+; GISEL-NEXT: .cfi_def_cfa_offset 432
; GISEL-NEXT: .cfi_offset w30, -8
; GISEL-NEXT: .cfi_offset w29, -16
; GISEL-NEXT: .cfi_offset w19, -24
@@ -1242,38 +1242,44 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: ldp x10, x11, [x1]
; GISEL-NEXT: mov w8, w2
; GISEL-NEXT: lsr x9, x8, #6
-; GISEL-NEXT: and x16, x8, #0x3f
+; GISEL-NEXT: and x12, x8, #0x3f
+; GISEL-NEXT: str x0, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT: and x14, x8, #0x3f
; GISEL-NEXT: mov w13, #64 ; =0x40
-; GISEL-NEXT: sub x21, x13, x16
-; GISEL-NEXT: str x0, [sp, #112] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x24, x16
-; GISEL-NEXT: lsl x25, x10, x16
+; GISEL-NEXT: and x16, x8, #0x3f
+; GISEL-NEXT: lsl x0, x10, x12
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: lsr x26, x10, x21
-; GISEL-NEXT: lsl x2, x11, x16
-; GISEL-NEXT: lsr x23, x11, x21
-; GISEL-NEXT: mov x22, x21
-; GISEL-NEXT: csel x12, x25, xzr, eq
+; GISEL-NEXT: sub x2, x13, x14
+; GISEL-NEXT: lsr x3, x10, x2
+; GISEL-NEXT: lsl x6, x11, x14
+; GISEL-NEXT: and x14, x8, #0x3f
+; GISEL-NEXT: csel x12, x0, xzr, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x1, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT: lsr x20, x11, x2
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: str x23, [sp, #208] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x24, x0
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: stp x24, x22, [sp, #40] ; 16-byte Folded Spill
+; GISEL-NEXT: mov x7, x3
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #4
+; GISEL-NEXT: mov x28, x1
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #5
+; GISEL-NEXT: and x21, x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #6
+; GISEL-NEXT: str x6, [sp, #24] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #7
+; GISEL-NEXT: str x28, [sp, #304] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #8
+; GISEL-NEXT: str x7, [sp, #272] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #9
+; GISEL-NEXT: str x20, [sp, #112] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1290,13 +1296,13 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x10, x10, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #192] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x26, eq
+; GISEL-NEXT: str x10, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x10, xzr, x3, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x10, x2, x10
+; GISEL-NEXT: orr x10, x6, x10
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x0, x10, eq
; GISEL-NEXT: cmp x9, #2
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #3
@@ -1327,25 +1333,24 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: lsl x20, x12, x16
+; GISEL-NEXT: lsl x26, x12, x14
; GISEL-NEXT: csel x11, x11, x13, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: str x11, [sp, #224] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x11, xzr, x20, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x20, x11
-; GISEL-NEXT: lsr x15, x12, x21
-; GISEL-NEXT: lsl x14, x10, x16
+; GISEL-NEXT: orr x11, x26, x11
+; GISEL-NEXT: lsr x15, x12, x2
+; GISEL-NEXT: lsl x30, x10, x16
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x17, x10, x21
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: lsr x17, x10, x2
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x0, x11, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #4
@@ -1375,23 +1380,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #216] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x15, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x14, x11
+; GISEL-NEXT: orr x11, x30, x11
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: orr x12, x6, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x0, x11, eq
; GISEL-NEXT: cmp x9, #4
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #5
@@ -1421,33 +1426,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: lsl x0, x12, x16
; GISEL-NEXT: csel x10, x10, x13, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: str x10, [sp, #208] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, xzr, x17, eq
; GISEL-NEXT: cmp x9, #0
; GISEL-NEXT: orr x10, x0, x10
-; GISEL-NEXT: lsr x27, x12, x21
+; GISEL-NEXT: lsr x4, x12, x2
; GISEL-NEXT: lsl x19, x11, x16
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x3, x11, x21
+; GISEL-NEXT: mov x16, x15
; GISEL-NEXT: csel x13, xzr, x15, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: stp x27, x0, [sp, #240] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: mov x7, x3
+; GISEL-NEXT: str x4, [sp, #248] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: str x0, [sp, #48] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
; GISEL-NEXT: cmp x9, #5
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #6
@@ -1473,8 +1478,8 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #160] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x27, eq
+; GISEL-NEXT: str x10, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x10, xzr, x4, eq
; GISEL-NEXT: cmp x9, #0
; GISEL-NEXT: orr x10, x19, x10
; GISEL-NEXT: csel x10, x10, xzr, eq
@@ -1486,20 +1491,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x15, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: and x15, x8, #0x3f
+; GISEL-NEXT: orr x12, x30, x12
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: lsr x3, x11, x2
+; GISEL-NEXT: orr x12, x6, x12
; GISEL-NEXT: csel x10, x12, x10, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: csel x10, x25, x10, eq
+; GISEL-NEXT: csel x10, x24, x10, eq
; GISEL-NEXT: cmp x9, #6
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #7
@@ -1522,21 +1529,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: lsl x4, x12, x16
+; GISEL-NEXT: lsl x22, x12, x15
; GISEL-NEXT: csel x11, x11, x13, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #192] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x3, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x4, x11
-; GISEL-NEXT: lsl x30, x10, x16
-; GISEL-NEXT: lsr x28, x10, x21
+; GISEL-NEXT: orr x11, x22, x11
+; GISEL-NEXT: lsl x5, x10, x15
+; GISEL-NEXT: lsr x27, x10, x2
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x30, [sp, #200] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x25, x27
; GISEL-NEXT: orr x13, x19, x13
+; GISEL-NEXT: mov x14, x5
+; GISEL-NEXT: str x27, [sp, #328] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1544,30 +1553,29 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x11, x13, x11, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: lsr x13, x12, x21
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: lsr x13, x12, x2
+; GISEL-NEXT: csel x11, x24, x11, eq
; GISEL-NEXT: cmp x9, #7
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x6, x13
+; GISEL-NEXT: mov x15, x13
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: str x6, [sp, #256] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1584,18 +1592,18 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #144] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, xzr, x13, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x30, x11
+; GISEL-NEXT: orr x11, x5, x11
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: orr x12, x22, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
; GISEL-NEXT: cmp x9, #2
; GISEL-NEXT: orr x12, x19, x12
; GISEL-NEXT: csel x11, x12, x11, eq
@@ -1605,22 +1613,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: orr x12, x0, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x30, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x20, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: orr x12, x6, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
; GISEL-NEXT: cmp x9, #8
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #9
@@ -1635,39 +1643,34 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #14
; GISEL-NEXT: csel x12, xzr, x11, eq
-; GISEL-NEXT: ldp x11, x5, [x1, #64]
+; GISEL-NEXT: ldp x11, x1, [x1, #64]
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x12, x10, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsl x21, x11, x16
-; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: lsl x23, x11, x21
+; GISEL-NEXT: str x12, [sp, #176] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x27, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: lsr x10, x11, x22
-; GISEL-NEXT: mov x16, x19
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: lsr x21, x11, x2
+; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x12, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x1, x16
; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: str x16, [sp, #304] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: orr x13, x5, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: lsl x3, x5, x24
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: orr x13, x22, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: stp x21, x3, [sp, #216] ; 16-byte Folded Spill
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: orr x13, x19, x13
-; GISEL-NEXT: mov x19, x28
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x17, eq
@@ -1675,27 +1678,30 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: orr x13, x30, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x23, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x20, x13
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x7, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x6, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: csel x12, x25, x12, eq
+; GISEL-NEXT: and x13, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
; GISEL-NEXT: cmp x9, #9
+; GISEL-NEXT: lsl x10, x1, x13
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: stp x10, x15, [sp, #312] ; 16-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #12
; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1708,69 +1714,69 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x11, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #128] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x10, eq
+; GISEL-NEXT: str x11, [sp, #168] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x11, xzr, x21, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x10, x11
+; GISEL-NEXT: mov x10, x23
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x28, eq
+; GISEL-NEXT: csel x12, xzr, x27, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x28, x4
-; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: str x28, [sp, #32] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x27, x24
+; GISEL-NEXT: orr x12, x23, x12
+; GISEL-NEXT: mov x23, x15
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x15, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: mov x15, x22
+; GISEL-NEXT: orr x12, x5, x12
+; GISEL-NEXT: mov x5, x3
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: stp x14, x5, [sp, #256] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x3, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x12, x4, x12
-; GISEL-NEXT: mov x4, x20
+; GISEL-NEXT: mov x5, x4
+; GISEL-NEXT: orr x12, x22, x12
+; GISEL-NEXT: lsr x22, x1, x2
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x27, eq
+; GISEL-NEXT: csel x12, xzr, x4, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x27, x2
-; GISEL-NEXT: orr x12, x16, x12
-; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: str x22, [sp, #240] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x19, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x12, xzr, x17, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x17, x15
; GISEL-NEXT: orr x12, x0, x12
-; GISEL-NEXT: lsr x0, x5, x22
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x14, x12
-; GISEL-NEXT: str x0, [sp, #280] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x30, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x23, eq
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x23, x25
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x26, eq
+; GISEL-NEXT: csel x12, xzr, x7, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: mov x2, x3
+; GISEL-NEXT: mov x7, x14
+; GISEL-NEXT: orr x12, x6, x12
+; GISEL-NEXT: mov x6, x28
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: csel x11, x24, x11, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x25, x26
+; GISEL-NEXT: ldr x24, [x6, #88]
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #11
+; GISEL-NEXT: ldr x6, [sp, #272] ; 8-byte Folded Reload
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #12
; GISEL-NEXT: csel x11, xzr, x11, eq
@@ -1780,80 +1786,84 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldr x11, [x28, #80]
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: ldp x11, x5, [x15, #80]
+; GISEL-NEXT: csel x12, x1, x12, eq
+; GISEL-NEXT: mov x28, x2
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x12, [sp, #120] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x15, x7
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: lsl x2, x11, x13
+; GISEL-NEXT: str x12, [sp, #160] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x22, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x15, [sp, #24] ; 8-byte Folded Spill
-; GISEL-NEXT: lsl x20, x11, x24
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: str x20, [sp, #232] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: str x28, [sp, #16] ; 8-byte Folded Spill
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: str x2, [sp, #280] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x12, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x10, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: lsl x3, x5, x24
+; GISEL-NEXT: orr x13, x1, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: stp x19, x3, [sp, #264] ; 16-byte Folded Spill
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x25, x16
+; GISEL-NEXT: orr x13, x10, x13
+; GISEL-NEXT: mov x10, x30
+; GISEL-NEXT: str x25, [sp, #80] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x23, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x23, x3
+; GISEL-NEXT: orr x13, x14, x13
+; GISEL-NEXT: mov x14, x17
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
-; GISEL-NEXT: ldp x7, x30, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: stp x19, x14, [sp, #64] ; 16-byte Folded Spill
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: mov x3, x21
+; GISEL-NEXT: orr x13, x15, x13
+; GISEL-NEXT: str x3, [sp, #32] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x4, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x13, x1, x13
-; GISEL-NEXT: mov x1, x14
+; GISEL-NEXT: mov x4, x0
+; GISEL-NEXT: orr x13, x19, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x16, eq
+; GISEL-NEXT: csel x13, xzr, x17, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: mov x17, x27
+; GISEL-NEXT: orr x13, x0, x13
+; GISEL-NEXT: ldr x0, [sp, #24] ; 8-byte Folded Reload
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x17, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldr x14, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x30, x13
+; GISEL-NEXT: ldp x30, x16, [sp, #320] ; 16-byte Folded Reload
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x14, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x4, x13
-; GISEL-NEXT: mov x4, x10
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x26, eq
+; GISEL-NEXT: csel x13, xzr, x6, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: mov x26, x27
-; GISEL-NEXT: orr x13, x27, x13
-; GISEL-NEXT: lsr x27, x11, x22
+; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x12, x13, x12, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: mov x13, x23
-; GISEL-NEXT: csel x12, x23, x12, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x12, x27, x12, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: str x27, [sp, #64] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: mov x23, x20
+; GISEL-NEXT: str x13, [sp, #96] ; 8-byte Folded Spill
; GISEL-NEXT: csel x12, xzr, x12, eq
; GISEL-NEXT: cmp x9, #13
; GISEL-NEXT: csel x12, xzr, x12, eq
@@ -1864,71 +1874,77 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x11, x11, x12, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x11, [sp, #104] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x11, xzr, x27, eq
+; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill
+; GISEL-NEXT: and x11, x8, #0x3f
+; GISEL-NEXT: lsl x27, x24, x11
+; GISEL-NEXT: csel x11, xzr, x13, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x27, x11
+; GISEL-NEXT: str x27, [sp, #56] ; 8-byte Folded Spill
; GISEL-NEXT: csel x11, x11, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x0, eq
+; GISEL-NEXT: csel x12, xzr, x22, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: mov x0, x7
-; GISEL-NEXT: orr x12, x20, x12
-; GISEL-NEXT: mov x20, x16
+; GISEL-NEXT: mov x22, x2
+; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x2, x14
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x10, eq
+; GISEL-NEXT: csel x12, xzr, x21, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x10, [sp, #312] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x12, x2, x12
-; GISEL-NEXT: ldr x2, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x21, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: mov x1, x27
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x19, eq
+; GISEL-NEXT: csel x12, xzr, x16, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: orr x12, x21, x12
-; GISEL-NEXT: ldr x21, [sp, #200] ; 8-byte Folded Reload
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x6, eq
+; GISEL-NEXT: csel x12, xzr, x30, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x12, x21, x12
+; GISEL-NEXT: orr x12, x7, x12
+; GISEL-NEXT: mov x7, x15
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x15, eq
+; GISEL-NEXT: str x7, [sp, #40] ; 8-byte Folded Spill
+; GISEL-NEXT: csel x12, xzr, x23, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x12, x28, x12
+; GISEL-NEXT: orr x12, x15, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x7, eq
+; GISEL-NEXT: csel x12, xzr, x5, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: mov x7, x17
-; GISEL-NEXT: orr x12, x2, x12
+; GISEL-NEXT: mov x5, x19
+; GISEL-NEXT: orr x12, x19, x12
+; GISEL-NEXT: mov x19, x7
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x16, eq
+; GISEL-NEXT: csel x12, xzr, x14, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x12, x30, x12
+; GISEL-NEXT: lsr x14, x24, x28
+; GISEL-NEXT: orr x12, x4, x12
+; GISEL-NEXT: mov x4, x10
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x17, eq
+; GISEL-NEXT: csel x12, xzr, x25, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: mov x17, x24
-; GISEL-NEXT: orr x12, x1, x12
+; GISEL-NEXT: orr x12, x10, x12
+; GISEL-NEXT: ldr x10, [sp, #304] ; 8-byte Folded Reload
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #8] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x12, xzr, x20, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x12, x14, x12
+; GISEL-NEXT: orr x12, x26, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x12, xzr, x25, eq
+; GISEL-NEXT: csel x12, xzr, x6, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x12, x26, x12
+; GISEL-NEXT: orr x12, x0, x12
; GISEL-NEXT: csel x11, x12, x11, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: csel x11, x13, x11, eq
+; GISEL-NEXT: csel x11, x17, x11, eq
; GISEL-NEXT: cmp x9, #12
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #13
@@ -1937,393 +1953,395 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) {
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x12, xzr, x11, eq
+; GISEL-NEXT: ldp x11, x6, [x10, #96]
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: ldp x11, x10, [x10, #96]
-; GISEL-NEXT: csel x12, x5, x12, eq
-; GISEL-NEXT: str x12, [sp, #96] ; 8-byte Folded Spill
-; GISEL-NEXT: mov x12, x22
-; GISEL-NEXT: lsr x22, x5, x22
-; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: mov x5, x27
-; GISEL-NEXT: lsl x24, x11, x24
-; GISEL-NEXT: str x10, [sp, #296] ; 8-byte Folded Spill
-; GISEL-NEXT: csel x10, xzr, x22, eq
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: csel x12, x24, x12, eq
+; GISEL-NEXT: tst x8, #0x3f
+; GISEL-NEXT: ldr x24, [sp, #248] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x15, x11, x10
+; GISEL-NEXT: csel x10, xzr, x14, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: str x22, [sp, #16] ; 8-byte Folded Spill
-; GISEL-NEXT: orr x10, x24, x10
+; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x12, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x10, x15, x10
+; GISEL-NEXT: str x15, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: mov x15, x13
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x27, eq
+; GISEL-NEXT: csel x13, xzr, x13, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: ldr x27, [sp, #280] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x13, x3, x13
-; GISEL-NEXT: mov x3, x26
+; GISEL-NEXT: orr x13, x27, x13
+; GISEL-NEXT: ldr x27, [sp, #240] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x13, xzr, x27, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x13, x23, x13
-; GISEL-NEXT: mov x23, x4
+; GISEL-NEXT: orr x13, x22, x13
+; GISEL-NEXT: ldr x22, [sp, #272] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x4, eq
-; GISEL-NEXT: ldp x4, x16, [sp, #216] ; 16-byte Folded Reload
+; GISEL-NEXT: csel x13, xzr, x3, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x13, x16, x13
+; GISEL-NEXT: orr x13, x12, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x19, eq
+; GISEL-NEXT: csel x13, xzr, x16, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x19, x1
-; GISEL-NEXT: orr x13, x4, x13
+; GISEL-NEXT: mov x16, x17
+; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: ldp x23, x21, [sp, #256] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x6, eq
+; GISEL-NEXT: csel x13, xzr, x30, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: mov x6, x14
-; GISEL-NEXT: orr x13, x21, x13
+; GISEL-NEXT: mov x30, x0
+; GISEL-NEXT: orr x13, x23, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x15, eq
+; GISEL-NEXT: csel x13, xzr, x21, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x13, x28, x13
+; GISEL-NEXT: orr x13, x7, x13
+; GISEL-NEXT: mov x7, x14
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x0, eq
+; GISEL-NEXT: csel x13, xzr, x24, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: mov x0, x23
-; GISEL-NEXT: orr x13, x2, x13
+; GISEL-NEXT: orr x13, x5, x13
+; GISEL-NEXT: ldr x5, [sp, #48] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x20, eq
+; GISEL-NEXT: csel x13, xzr, x2, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x13, x30, x13
-; GISEL-NEXT: ldr x30, [sp, #208] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x2, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x5, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x7, eq
+; GISEL-NEXT: csel x13, xzr, x25, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x13, x1, x13
+; GISEL-NEXT: mov x25, x6
+; GISEL-NEXT: orr x13, x4, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x30, eq
+; GISEL-NEXT: csel x13, xzr, x20, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x13, x14, x13
-; GISEL-NEXT: ldp x14, x2, [sp, #264] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x13, x26, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x13, xzr, x25, eq
+; GISEL-NEXT: csel x13, xzr, x22, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x13, x26, x13
-; GISEL-NEXT: ldr x26, [sp, #288] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x13, x0, x13
; GISEL-NEXT: csel x10, x13, x10, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: lsr x13, x11, x12
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: lsr x13, x11, x28
+; GISEL-NEXT: csel x10, x17, x10, eq
; GISEL-NEXT: cmp x9, #13
+; GISEL-NEXT: ldr x17, [sp, #80] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: str x13, [sp, #72] ; 8-byte Folded Spill
+; GISEL-NEXT: str x13, [sp, #104] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x8, #0
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: str x10, [sp, #88] ; 8-byte Folded Spill
-; GISEL-NEXT: ldr x10, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: lsl x11, x10, x17
+; GISEL-NEXT: str x10, [sp, #128] ; 8-byte Folded Spill
+; GISEL-NEXT: and x10, x8, #0x3f
+; GISEL-NEXT: lsl x11, x6, x10
; GISEL-NEXT: csel x10, xzr, x13, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x17, [sp, #232] ; 8-byte Folded Reload
-; GISEL-NEXT: ldr x13, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x0, x13, [sp, #280] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x6, x16
; GISEL-NEXT: orr x10, x11, x10
-; GISEL-NEXT: str x11, [sp, #56] ; 8-byte Folded Spill
+; GISEL-NEXT: str x11, [sp, #88] ; 8-byte Folded Spill
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x2, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x5, eq
+; GISEL-NEXT: csel x11, xzr, x15, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x2, x11
-; GISEL-NEXT: ldp x12, x5, [sp, #240] ; 16-byte Folded Reload
+; GISEL-NEXT: mov x15, x3
+; GISEL-NEXT: orr x11, x1, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x27, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: mov x27, x30
-; GISEL-NEXT: orr x11, x17, x11
+; GISEL-NEXT: orr x11, x0, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
+; GISEL-NEXT: ldp x14, x3, [sp, #320] ; 16-byte Folded Reload
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: mov x23, x20
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x12, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x14, eq
+; GISEL-NEXT: csel x11, xzr, x3, eq
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x11, x4, x11
+; GISEL-NEXT: orr x11, x13, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x13, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x11, x21, x11
-; GISEL-NEXT: ldr x21, [sp, #296] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x23, x11
+; GISEL-NEXT: mov x23, x5
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x15, eq
+; GISEL-NEXT: csel x11, xzr, x21, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x11, x28, x11
+; GISEL-NEXT: mov x21, x4
+; GISEL-NEXT: orr x11, x19, x11
+; GISEL-NEXT: ldp x12, x19, [sp, #64] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x12, eq
+; GISEL-NEXT: csel x11, xzr, x24, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x11, x16, x11
+; GISEL-NEXT: orr x11, x12, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x20, eq
+; GISEL-NEXT: csel x11, xzr, x19, eq
; GISEL-NEXT: cmp x9, #9
; GISEL-NEXT: orr x11, x5, x11
+; GISEL-NEXT: mov x5, x30
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x7, eq
+; GISEL-NEXT: csel x11, xzr, x17, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x11, x1, x11
-; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x4, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x30, eq
+; GISEL-NEXT: csel x11, xzr, x20, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x11, x6, x11
+; GISEL-NEXT: orr x11, x26, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x25, eq
+; GISEL-NEXT: csel x11, xzr, x22, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x30, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: csel x10, x26, x10, eq
+; GISEL-NEXT: csel x10, x16, x10, eq
; GISEL-NEXT: cmp x9, #14
+; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, xzr, x10, eq
; GISEL-NEXT: cmp x9, #15
; GISEL-NEXT: csel x11, xzr, x10, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x11, x21, x11, eq
-; GISEL-NEXT: ldp x10, x20, [x1, #112]
-; GISEL-NEXT: str x11, [sp, #80] ; 8-byte Folded Spill
-; GISEL-NEXT: ldp x11, x4, [sp, #40] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x10, x4, [x16, #112]
+; GISEL-NEXT: csel x11, x25, x11, eq
+; GISEL-NEXT: str x11, [sp, #120] ; 8-byte Folded Spill
+; GISEL-NEXT: lsr x11, x25, x28
+; GISEL-NEXT: and x16, x8, #0x3f
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: lsr x21, x21, x4
-; GISEL-NEXT: lsl x28, x10, x11
-; GISEL-NEXT: csel x1, xzr, x21, eq
-; GISEL-NEXT: str x21, [sp, #296] ; 8-byte Folded Spill
+; GISEL-NEXT: ldr x25, [sp, #88] ; 8-byte Folded Reload
+; GISEL-NEXT: lsl x24, x10, x16
+; GISEL-NEXT: csel x1, xzr, x11, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: orr x1, x28, x1
-; GISEL-NEXT: ldr x21, [sp, #72] ; 8-byte Folded Reload
-; GISEL-NEXT: str x28, [sp, #312] ; 8-byte Folded Spill
+; GISEL-NEXT: ldp x16, x28, [sp, #96] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x1, x24, x1
; GISEL-NEXT: csel x1, x1, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: ldr x28, [sp, #56] ; 8-byte Folded Reload
-; GISEL-NEXT: csel x30, xzr, x21, eq
+; GISEL-NEXT: csel x30, xzr, x28, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x30, x28, x30
+; GISEL-NEXT: orr x30, x25, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x7, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: ldr x22, [sp, #64] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x30, x24, x30
+; GISEL-NEXT: orr x30, x2, x30
+; GISEL-NEXT: ldr x2, [sp, #56] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x22, eq
+; GISEL-NEXT: csel x30, xzr, x16, eq
; GISEL-NEXT: cmp x9, #3
; GISEL-NEXT: orr x30, x2, x30
-; GISEL-NEXT: ldr x2, [sp, #280] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x2, eq
+; GISEL-NEXT: csel x30, xzr, x27, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x30, x17, x30
-; GISEL-NEXT: ldr x17, [sp, #224] ; 8-byte Folded Reload
+; GISEL-NEXT: mov x27, x13
+; GISEL-NEXT: orr x30, x0, x30
+; GISEL-NEXT: ldr x0, [sp, #248] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x0, eq
+; GISEL-NEXT: csel x30, xzr, x15, eq
+; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload
; GISEL-NEXT: cmp x9, #5
-; GISEL-NEXT: orr x30, x17, x30
+; GISEL-NEXT: orr x30, x15, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x14, eq
-; GISEL-NEXT: ldr x14, [sp, #216] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x3, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x30, x14, x30
+; GISEL-NEXT: ldr x3, [sp, #40] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x30, x13, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x13, eq
-; GISEL-NEXT: ldr x13, [sp, #200] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
+; GISEL-NEXT: ldp x13, x14, [sp, #256] ; 16-byte Folded Reload
; GISEL-NEXT: cmp x9, #7
; GISEL-NEXT: orr x30, x13, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x15, eq
-; GISEL-NEXT: ldr x15, [sp, #32] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x30, xzr, x14, eq
; GISEL-NEXT: cmp x9, #8
-; GISEL-NEXT: orr x30, x15, x30
+; GISEL-NEXT: orr x30, x3, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x12, eq
+; GISEL-NEXT: csel x30, xzr, x0, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x30, x16, x30
+; GISEL-NEXT: orr x30, x12, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x23, eq
+; GISEL-NEXT: csel x30, xzr, x19, eq
; GISEL-NEXT: cmp x9, #10
-; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: orr x30, x23, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x7, eq
+; GISEL-NEXT: csel x30, xzr, x17, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x30, x19, x30
+; GISEL-NEXT: orr x30, x21, x30
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x27, eq
+; GISEL-NEXT: csel x30, xzr, x20, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x30, x6, x30
+; GISEL-NEXT: mov x20, x26
+; GISEL-NEXT: orr x30, x26, x30
+; GISEL-NEXT: mov x26, x5
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x30, xzr, x25, eq
+; GISEL-NEXT: csel x30, xzr, x22, eq
; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: orr x30, x3, x30
+; GISEL-NEXT: orr x30, x5, x30
+; GISEL-NEXT: ldr x5, [sp, #16] ; 8-byte Folded Reload
; GISEL-NEXT: csel x1, x30, x1, eq
; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: lsr x30, x10, x4
-; GISEL-NEXT: csel x1, x26, x1, eq
+; GISEL-NEXT: csel x1, x6, x1, eq
; GISEL-NEXT: cmp x9, #15
+; GISEL-NEXT: lsr x30, x10, x5
; GISEL-NEXT: csel x1, xzr, x1, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: csel x26, x10, x1, eq
-; GISEL-NEXT: lsl x10, x20, x11
+; GISEL-NEXT: csel x5, x10, x1, eq
+; GISEL-NEXT: and x10, x8, #0x3f
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x16, xzr, x30, eq
+; GISEL-NEXT: lsl x10, x4, x10
+; GISEL-NEXT: csel x1, xzr, x30, eq
; GISEL-NEXT: cmp x9, #0
-; GISEL-NEXT: ldr x11, [sp, #296] ; 8-byte Folded Reload
-; GISEL-NEXT: orr x10, x10, x16
-; GISEL-NEXT: ldr x16, [sp, #312] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x29, x30, [sp, #416] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x10, x10, x1
+; GISEL-NEXT: ldr x1, [sp, #296] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x10, xzr, eq
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #1
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #272] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x24, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x21, eq
+; GISEL-NEXT: csel x11, xzr, x28, eq
; GISEL-NEXT: cmp x9, #2
-; GISEL-NEXT: orr x11, x28, x11
-; GISEL-NEXT: ldp x29, x30, [sp, #400] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x25, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x11, eq
+; GISEL-NEXT: csel x11, xzr, x7, eq
; GISEL-NEXT: cmp x9, #3
-; GISEL-NEXT: orr x11, x24, x11
+; GISEL-NEXT: orr x11, x1, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x22, eq
+; GISEL-NEXT: csel x11, xzr, x16, eq
; GISEL-NEXT: cmp x9, #4
-; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldr x16, [sp, #232] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x16, [sp, #280] ; 8-byte Folded Reload
+; GISEL-NEXT: orr x11, x2, x11
; GISEL-NEXT: csel x10, x11, x10, eq
+; GISEL-NEXT: ldr x11, [sp, #240] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x2, eq
+; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #5
; GISEL-NEXT: orr x11, x16, x11
-; GISEL-NEXT: ldp x22, x21, [sp, #368] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
+; GISEL-NEXT: ldr x11, [sp, #32] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x0, eq
+; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #6
-; GISEL-NEXT: orr x11, x17, x11
+; GISEL-NEXT: orr x11, x15, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #264] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #328] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #7
-; GISEL-NEXT: orr x11, x14, x11
+; GISEL-NEXT: orr x11, x27, x11
+; GISEL-NEXT: ldp x28, x27, [sp, #336] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #256] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #320] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #8
; GISEL-NEXT: orr x11, x13, x11
-; GISEL-NEXT: ldr x13, [sp, #112] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x13, [sp, #144] ; 8-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #24] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x11, eq
+; GISEL-NEXT: csel x11, xzr, x14, eq
; GISEL-NEXT: cmp x9, #9
-; GISEL-NEXT: orr x11, x15, x11
+; GISEL-NEXT: orr x11, x3, x11
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: tst x8, #0x3f
-; GISEL-NEXT: csel x11, xzr, x12, eq
-; GISEL-NEXT: ldr x12, [sp, #304] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x11, xzr, x0, eq
; GISEL-NEXT: cmp x9, #10
; GISEL-NEXT: orr x11, x12, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #192] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #232] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13]
-; GISEL-NEXT: ldp x12, x11, [sp, #176] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x12, x11, [sp, #216] ; 16-byte Folded Reload
; GISEL-NEXT: stp x11, x12, [x13, #8]
-; GISEL-NEXT: csel x11, xzr, x23, eq
+; GISEL-NEXT: csel x11, xzr, x19, eq
; GISEL-NEXT: cmp x9, #11
-; GISEL-NEXT: orr x11, x5, x11
-; GISEL-NEXT: ldp x24, x23, [sp, #352] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x23, x11
+; GISEL-NEXT: ldp x24, x23, [sp, #368] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #168] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #208] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13, #24]
-; GISEL-NEXT: ldp x12, x11, [sp, #152] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x12, x11, [sp, #192] ; 16-byte Folded Reload
; GISEL-NEXT: stp x11, x12, [x13, #32]
-; GISEL-NEXT: csel x11, xzr, x7, eq
+; GISEL-NEXT: csel x11, xzr, x17, eq
; GISEL-NEXT: cmp x9, #12
-; GISEL-NEXT: orr x11, x19, x11
+; GISEL-NEXT: orr x11, x21, x11
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #144] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #184] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13, #48]
-; GISEL-NEXT: ldp x12, x11, [sp, #128] ; 16-byte Folded Reload
+; GISEL-NEXT: ldp x12, x11, [sp, #168] ; 16-byte Folded Reload
; GISEL-NEXT: stp x11, x12, [x13, #56]
-; GISEL-NEXT: csel x11, xzr, x27, eq
+; GISEL-NEXT: ldr x11, [sp, #112] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x12, [sp, #136] ; 8-byte Folded Reload
+; GISEL-NEXT: csel x11, xzr, x11, eq
; GISEL-NEXT: cmp x9, #13
-; GISEL-NEXT: orr x11, x6, x11
-; GISEL-NEXT: ldp x28, x27, [sp, #320] ; 16-byte Folded Reload
+; GISEL-NEXT: orr x11, x20, x11
+; GISEL-NEXT: ldp x20, x19, [sp, #400] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
-; GISEL-NEXT: ldr x11, [sp, #120] ; 8-byte Folded Reload
+; GISEL-NEXT: ldr x11, [sp, #160] ; 8-byte Folded Reload
; GISEL-NEXT: tst x8, #0x3f
; GISEL-NEXT: str x11, [x13, #72]
-; GISEL-NEXT: ldp x12, x11, [sp, #96] ; 16-byte Folded Reload
-; GISEL-NEXT: stp x11, x12, [x13, #80]
-; GISEL-NEXT: csel x11, xzr, x25, eq
+; GISEL-NEXT: ldr x11, [sp, #152] ; 8-byte Folded Reload
+; GISEL-NEXT: str x11, [x13, #80]
+; GISEL-NEXT: csel x11, xzr, x22, eq
; GISEL-NEXT: cmp x9, #14
-; GISEL-NEXT: orr x11, x3, x11
+; GISEL-NEXT: orr x11, x26, x11
+; GISEL-NEXT: ldp x22, x21, [sp, #384] ; 16-byte Folded Reload
; GISEL-NEXT: csel x10, x11, x10, eq
; GISEL-NEXT: cmp x9, #15
-; GISEL-NEXT: ldr x9, [sp, #288] ; 8-byte Folded Reload
-; GISEL-NEXT: ldr x11, [sp, #88] ; 8-byte Folded Reload
-; GISEL-NEXT: csel x9, x9, x10, eq
+; GISEL-NEXT: ldr x9, [sp, #128] ; 8-byte Folded Reload
+; GISEL-NEXT: ldp x26, x25, [sp, #352] ; 16-byte Folded Reload
+; GISEL-NEXT: stp x12, x9, [x13, #88]
+; GISEL-NEXT: csel x9, x6, x10, eq
; GISEL-NEXT: cmp x8, #0
-; GISEL-NEXT: ldr x8, [sp, #80] ; 8-byte Folded Reload
-; GISEL-NEXT: stp x11, x8, [x13, #96]
-; GISEL-NEXT: csel x8, x20, x9, eq
-; GISEL-NEXT: stp x26, x8, [x13, #112]
-; GISEL-NEXT: ldp x20, x19, [sp, #384] ; 16-byte Folded Reload
-; GISEL-NEXT: ldp x26, x25, [sp, #336] ; 16-byte Folded Reload
-; GISEL-NEXT: add sp, sp, #416
+; GISEL-NEXT: ldr x8, [sp, #120] ; 8-byte Folded Reload
+; GISEL-NEXT: stp x8, x5, [x13, #104]
+; GISEL-NEXT: csel x8, x4, x9, eq
+; GISEL-NEXT: str x8, [x13, #120]
+; GISEL-NEXT: add sp, sp, #432
; GISEL-NEXT: ret
entry:
%input_val = load i1024, ptr %input, align 128
diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
index 63c08dd..b215c51 100644
--- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
+++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll
@@ -267,7 +267,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
; CHECK-SD-NEXT: add x10, x2, #32
; CHECK-SD-NEXT: add x11, x0, #16
-; CHECK-SD-NEXT: mov x12, x9
+; CHECK-SD-NEXT: and x12, x8, #0xfffffff0
; CHECK-SD-NEXT: .LBB3_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
@@ -313,7 +313,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-GI-NEXT: and x10, x9, #0xfffffff0
; CHECK-GI-NEXT: add x11, x2, #32
; CHECK-GI-NEXT: add x12, x0, #16
-; CHECK-GI-NEXT: mov x13, x10
+; CHECK-GI-NEXT: and x13, x9, #0xfffffff0
; CHECK-GI-NEXT: xtn v0.4h, v0.4s
; CHECK-GI-NEXT: .LBB3_3: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
@@ -428,7 +428,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-SD-NEXT: and x9, x8, #0xfffffff0
; CHECK-SD-NEXT: add x10, x2, #32
; CHECK-SD-NEXT: add x11, x0, #16
-; CHECK-SD-NEXT: mov x12, x9
+; CHECK-SD-NEXT: and x12, x8, #0xfffffff0
; CHECK-SD-NEXT: .LBB4_4: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16]
@@ -472,7 +472,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr
; CHECK-GI-NEXT: and x8, x9, #0xfffffff0
; CHECK-GI-NEXT: add x10, x2, #32
; CHECK-GI-NEXT: add x11, x0, #16
-; CHECK-GI-NEXT: mov x12, x8
+; CHECK-GI-NEXT: and x12, x9, #0xfffffff0
; CHECK-GI-NEXT: .LBB4_3: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: and w13, w1, #0xffff
@@ -596,7 +596,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-SD-NEXT: and x11, x10, #0xfffffff0
; CHECK-SD-NEXT: fmov s2, w9
; CHECK-SD-NEXT: add x8, x0, #8
-; CHECK-SD-NEXT: mov x12, x11
+; CHECK-SD-NEXT: and x12, x10, #0xfffffff0
; CHECK-SD-NEXT: .LBB5_5: // %vector.body
; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-SD-NEXT: ldp d3, d4, [x8, #-8]
@@ -646,10 +646,10 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A,
; CHECK-GI-NEXT: movi v0.2d, #0000000000000000
; CHECK-GI-NEXT: movi v1.2d, #0000000000000000
; CHECK-GI-NEXT: add x10, x0, #8
+; CHECK-GI-NEXT: and x11, x8, #0xfffffff0
; CHECK-GI-NEXT: sbfx w9, w9, #8, #8
; CHECK-GI-NEXT: dup v2.8h, w9
; CHECK-GI-NEXT: and x9, x8, #0xfffffff0
-; CHECK-GI-NEXT: mov x11, x9
; CHECK-GI-NEXT: .LBB5_5: // %vector.body
; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-GI-NEXT: ldp d3, d4, [x10, #-8]
diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
index 4c8e589..c23e4e1 100644
--- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
+++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll
@@ -17,7 +17,7 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef
; CHECK-NEXT: and x9, x8, #0xfffffff0
; CHECK-NEXT: add x10, x1, #16
; CHECK-NEXT: add x11, x0, #16
-; CHECK-NEXT: mov x12, x9
+; CHECK-NEXT: and x12, x8, #0xfffffff0
; CHECK-NEXT: .LBB0_4: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q1, q4, [x10, #-16]
diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
index f6bbdf5..1770bb9 100644
--- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
+++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll
@@ -14,7 +14,6 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
; CHECK-NEXT: mov x9, xzr
; CHECK-NEXT: and x12, x10, #0xfffffff0
; CHECK-NEXT: add x13, x1, #32
-; CHECK-NEXT: add x14, x2, #16
; CHECK-NEXT: b .LBB0_3
; CHECK-NEXT: .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
@@ -27,52 +26,52 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) {
; CHECK-NEXT: // =>This Loop Header: Depth=1
; CHECK-NEXT: // Child Loop BB0_6 Depth 2
; CHECK-NEXT: // Child Loop BB0_9 Depth 2
-; CHECK-NEXT: ldrsh w15, [x2, x9, lsl #1]
+; CHECK-NEXT: ldrsh w14, [x2, x9, lsl #1]
; CHECK-NEXT: cmp w0, #16
; CHECK-NEXT: b.hs .LBB0_5
; CHECK-NEXT: // %bb.4: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: mov x18, xzr
+; CHECK-NEXT: mov x17, xzr
; CHECK-NEXT: b .LBB0_8
; CHECK-NEXT: .LBB0_5: // %vector.ph
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: dup v0.8h, w15
-; CHECK-NEXT: mov x16, x14
-; CHECK-NEXT: mov x17, x13
-; CHECK-NEXT: mov x18, x12
+; CHECK-NEXT: dup v0.8h, w14
+; CHECK-NEXT: add x15, x2, #16
+; CHECK-NEXT: mov x16, x13
+; CHECK-NEXT: and x17, x10, #0xfffffff0
; CHECK-NEXT: .LBB0_6: // %vector.body
; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldp q1, q4, [x16, #-16]
-; CHECK-NEXT: subs x18, x18, #16
-; CHECK-NEXT: ldp q3, q2, [x17, #-32]
-; CHECK-NEXT: add x16, x16, #32
-; CHECK-NEXT: ldp q6, q5, [x17]
+; CHECK-NEXT: ldp q1, q4, [x15, #-16]
+; CHECK-NEXT: subs x17, x17, #16
+; CHECK-NEXT: ldp q3, q2, [x16, #-32]
+; CHECK-NEXT: add x15, x15, #32
+; CHECK-NEXT: ldp q6, q5, [x16]
; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h
; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h
; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h
; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h
-; CHECK-NEXT: stp q3, q2, [x17, #-32]
-; CHECK-NEXT: stp q6, q5, [x17], #64
+; CHECK-NEXT: stp q3, q2, [x16, #-32]
+; CHECK-NEXT: stp q6, q5, [x16], #64
; CHECK-NEXT: b.ne .LBB0_6
; CHECK-NEXT: // %bb.7: // %middle.block
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
; CHECK-NEXT: cmp x12, x10
-; CHECK-NEXT: mov x18, x12
+; CHECK-NEXT: and x17, x10, #0xfffffff0
; CHECK-NEXT: b.eq .LBB0_2
; CHECK-NEXT: .LBB0_8: // %for.body4.us.preheader
; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1
-; CHECK-NEXT: add x16, x18, x8
-; CHECK-NEXT: add x17, x2, x18, lsl #1
-; CHECK-NEXT: sub x18, x10, x18
-; CHECK-NEXT: add x16, x1, x16, lsl #2
+; CHECK-NEXT: add x15, x17, x8
+; CHECK-NEXT: add x16, x2, x17, lsl #1
+; CHECK-NEXT: sub x17, x10, x17
+; CHECK-NEXT: add x15, x1, x15, lsl #2
; CHECK-NEXT: .LBB0_9: // %for.body4.us
; CHECK-NEXT: // Parent Loop BB0_3 Depth=1
; CHECK-NEXT: // => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrsh w3, [x17], #2
-; CHECK-NEXT: ldr w4, [x16]
-; CHECK-NEXT: subs x18, x18, #1
-; CHECK-NEXT: madd w3, w3, w15, w4
-; CHECK-NEXT: str w3, [x16], #4
+; CHECK-NEXT: ldrsh w18, [x16], #2
+; CHECK-NEXT: ldr w3, [x15]
+; CHECK-NEXT: subs x17, x17, #1
+; CHECK-NEXT: madd w18, w18, w14, w3
+; CHECK-NEXT: str w18, [x15], #4
; CHECK-NEXT: b.ne .LBB0_9
; CHECK-NEXT: b .LBB0_2
; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup
diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
index 3caac1d..74b0e69 100644
--- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
+++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll
@@ -278,9 +278,9 @@ define i64 @test_and_4(i64 %x, i64 %y) {
; CHECK-GI-NEXT: .cfi_offset w19, -8
; CHECK-GI-NEXT: .cfi_offset w20, -16
; CHECK-GI-NEXT: .cfi_offset w30, -32
-; CHECK-GI-NEXT: and x20, x0, #0x3
; CHECK-GI-NEXT: mov x19, x0
-; CHECK-GI-NEXT: mov x0, x20
+; CHECK-GI-NEXT: and x20, x0, #0x3
+; CHECK-GI-NEXT: and x0, x0, #0x3
; CHECK-GI-NEXT: bl callee
; CHECK-GI-NEXT: tst x19, #0x3
; CHECK-GI-NEXT: csel x0, x20, x0, eq
diff --git a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
index e0f2155..58c01db 100644
--- a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
+++ b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll
@@ -7,20 +7,16 @@
define void @foo(i64 %v1, i64 %v2, ptr %ptr) {
; CHECK-LABEL: foo:
; CHECK: // %bb.0:
-; CHECK-NEXT: sub sp, sp, #16
-; CHECK-NEXT: .cfi_def_cfa_offset 16
; CHECK-NEXT: add x3, x0, x1
-; CHECK-NEXT: str x3, [sp, #8] // 8-byte Folded Spill
; CHECK-NEXT: str x3, [x2, #8]
; CHECK-NEXT: ldr x3, [x2, #16]
; CHECK-NEXT: add x3, x0, x3
; CHECK-NEXT: sub x3, x3, x1
; CHECK-NEXT: str x3, [x2, #16]
-; CHECK-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload
+; CHECK-NEXT: add x3, x0, x1
; CHECK-NEXT: str x3, [x2, #24]
; CHECK-NEXT: str x0, [x2, #32]
; CHECK-NEXT: str x1, [x2, #40]
-; CHECK-NEXT: add sp, sp, #16
; CHECK-NEXT: ret
%v3 = add i64 %v1, %v2
%p1 = getelementptr i64, ptr %ptr, i64 1
diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll
index 5fc996a..0f62997 100644
--- a/llvm/test/CodeGen/AArch64/tbl-loops.ll
+++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll
@@ -23,7 +23,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: add x13, x1, #16
; CHECK-NEXT: add x8, x1, x10, lsl #2
; CHECK-NEXT: add x9, x0, x10
-; CHECK-NEXT: mov x14, x10
+; CHECK-NEXT: and x14, x11, #0x1fffffff8
; CHECK-NEXT: .LBB0_4: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ldp q1, q2, [x13, #-16]
@@ -194,9 +194,9 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000
; CHECK-NEXT: and x10, x11, #0x1fffffffc
; CHECK-NEXT: dup v0.4s, w8
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
; CHECK-NEXT: add x8, x1, x10, lsl #3
; CHECK-NEXT: add x9, x0, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
; CHECK-NEXT: .LBB1_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld2 { v1.4s, v2.4s }, [x1], #32
@@ -341,7 +341,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: dup v0.4s, w8
; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI2_0]
; CHECK-NEXT: add x9, x10, x10, lsl #1
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
; CHECK-NEXT: add x8, x1, x9, lsl #2
; CHECK-NEXT: add x9, x0, x9
; CHECK-NEXT: .LBB2_4: // %vector.body
@@ -597,7 +597,7 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n
; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI3_0]
; CHECK-NEXT: add x8, x1, x10, lsl #4
; CHECK-NEXT: add x9, x0, x10, lsl #2
-; CHECK-NEXT: mov x12, x10
+; CHECK-NEXT: and x12, x11, #0x1fffffffc
; CHECK-NEXT: .LBB3_9: // %vector.body
; CHECK-NEXT: // =>This Inner Loop Header: Depth=1
; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64
diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll
index 0e68270..3e933fa 100644
--- a/llvm/test/CodeGen/AArch64/trampoline.ll
+++ b/llvm/test/CodeGen/AArch64/trampoline.ll
@@ -263,3 +263,9 @@ define i64 @func2() {
%fp = call ptr @llvm.adjust.trampoline(ptr @trampg)
ret i64 0
}
+
+; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the
+; presence of trampolines.
+; UTC_ARGS: --disable
+; CHECK-LINUX: .section ".note.GNU-stack","x",@progbits
+; UTC_ARGS: --enable
diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
index ca5d089..8ca4c43 100644
--- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll
+++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll
@@ -27,11 +27,11 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr}
; CHECK-NEXT: sub sp, #4
-; CHECK-NEXT: sub.w r7, r2, #32
-; CHECK-NEXT: mov r8, r0
+; CHECK-NEXT: sub.w r8, r2, #32
+; CHECK-NEXT: mov r6, r0
; CHECK-NEXT: movs r0, #1
; CHECK-NEXT: mov r4, r2
-; CHECK-NEXT: add.w r6, r0, r7, lsr #5
+; CHECK-NEXT: add.w r7, r0, r8, lsr #5
; CHECK-NEXT: mov r5, r1
; CHECK-NEXT: mov.w r9, #0
; CHECK-NEXT: b .LBB0_2
@@ -44,16 +44,16 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc
; CHECK-NEXT: mov r2, r4
; CHECK-NEXT: cmp r4, #31
; CHECK-NEXT: ldr r0, [r1, #16]
-; CHECK-NEXT: add.w r0, r0, r6, lsl #2
+; CHECK-NEXT: add.w r0, r0, r7, lsl #2
; CHECK-NEXT: ldr r0, [r0, #40]
; CHECK-NEXT: it hi
-; CHECK-NEXT: andhi r2, r7, #31
+; CHECK-NEXT: andhi r2, r8, #31
; CHECK-NEXT: lsrs r0, r2
; CHECK-NEXT: lsls r0, r0, #31
; CHECK-NEXT: beq .LBB0_1
; CHECK-NEXT: @ %bb.3: @ %if.then
; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1
-; CHECK-NEXT: mov r0, r8
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: bl foo
; CHECK-NEXT: str.w r9, [r5, #4]
; CHECK-NEXT: b .LBB0_1
diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll
index 77deaa5..d717806 100644
--- a/llvm/test/CodeGen/ARM/extract-bits.ll
+++ b/llvm/test/CodeGen/ARM/extract-bits.ll
@@ -316,28 +316,28 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind {
;
; V7A-LABEL: bextr64_a0:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
; V7A-NEXT: lsr r1, r1, r2
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r1
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: and r1, r3, r1
+; V7A-NEXT: pop {r4, r5, r11, pc}
;
; V7A-T-LABEL: bextr64_a0:
; V7A-T: @ %bb.0:
@@ -434,28 +434,28 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n
;
; V7A-LABEL: bextr64_a0_arithmetic:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
; V7A-NEXT: asr r2, r1, r2
-; V7A-NEXT: asrpl r0, r1, r3
; V7A-NEXT: asrpl r2, r1, #31
-; V7A-NEXT: and r0, r4, r0
-; V7A-NEXT: and r1, r12, r2
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: asrpl r0, r1, r4
+; V7A-NEXT: and r1, r3, r2
+; V7A-NEXT: and r0, r5, r0
+; V7A-NEXT: pop {r4, r5, r11, pc}
;
; V7A-T-LABEL: bextr64_a0_arithmetic:
; V7A-T: @ %bb.0:
@@ -911,28 +911,28 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits)
;
; V7A-LABEL: bextr64_a4_commutative:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r4, lr}
-; V7A-NEXT: push {r4, lr}
-; V7A-NEXT: ldr r12, [sp, #8]
-; V7A-NEXT: mov lr, #1
+; V7A-NEXT: .save {r4, r5, r11, lr}
+; V7A-NEXT: push {r4, r5, r11, lr}
+; V7A-NEXT: ldr lr, [sp, #16]
+; V7A-NEXT: mov r5, #1
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: rsb r3, r12, #32
-; V7A-NEXT: subs r4, r12, #32
-; V7A-NEXT: lsr r3, lr, r3
-; V7A-NEXT: lslpl r3, lr, r4
-; V7A-NEXT: lsl r4, lr, r12
-; V7A-NEXT: movwpl r4, #0
-; V7A-NEXT: subs r4, r4, #1
-; V7A-NEXT: sbc r12, r3, #0
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: orr r0, r0, r1, lsl r3
-; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsrpl r0, r1, r3
+; V7A-NEXT: rsb r12, lr, #32
+; V7A-NEXT: subs r4, lr, #32
+; V7A-NEXT: lsr r3, r5, r12
+; V7A-NEXT: lslpl r3, r5, r4
+; V7A-NEXT: lsl r5, r5, lr
+; V7A-NEXT: movwpl r5, #0
+; V7A-NEXT: rsb r4, r2, #32
+; V7A-NEXT: subs r5, r5, #1
+; V7A-NEXT: sbc r3, r3, #0
+; V7A-NEXT: orr r0, r0, r1, lsl r4
+; V7A-NEXT: subs r4, r2, #32
+; V7A-NEXT: lsrpl r0, r1, r4
; V7A-NEXT: lsr r1, r1, r2
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: and r0, r0, r4
-; V7A-NEXT: and r1, r1, r12
-; V7A-NEXT: pop {r4, pc}
+; V7A-NEXT: and r0, r0, r5
+; V7A-NEXT: and r1, r1, r3
+; V7A-NEXT: pop {r4, r5, r11, pc}
;
; V7A-T-LABEL: bextr64_a4_commutative:
; V7A-T: @ %bb.0:
@@ -3456,22 +3456,22 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun
; V7M-NEXT: uxtb r2, r2
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: rsb.w r3, r2, #32
; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
+; V7M-NEXT: sub.w r12, r2, #32
+; V7M-NEXT: lsr.w r4, r0, r3
; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
+; V7M-NEXT: lsl.w r3, r1, r3
; V7M-NEXT: it pl
; V7M-NEXT: movpl r0, #0
; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
; V7M-NEXT: lsr.w r1, r1, r2
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
@@ -3715,26 +3715,26 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n
; V7M-NEXT: uxtb r2, r2
; V7M-NEXT: lsl.w r0, lr, r0
; V7M-NEXT: orr.w r0, r0, r12
-; V7M-NEXT: rsb.w r12, r2, #32
+; V7M-NEXT: sub.w r12, r2, #32
; V7M-NEXT: it pl
; V7M-NEXT: lsrpl.w r0, lr, r3
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
+; V7M-NEXT: rsb.w r3, r2, #32
; V7M-NEXT: lsls r1, r2
-; V7M-NEXT: sub.w r3, r2, #32
-; V7M-NEXT: lsr.w r4, r0, r12
-; V7M-NEXT: orrs r1, r4
-; V7M-NEXT: cmp r3, #0
+; V7M-NEXT: cmp.w r12, #0
+; V7M-NEXT: lsr.w r4, r0, r3
+; V7M-NEXT: orr.w r1, r1, r4
; V7M-NEXT: it pl
-; V7M-NEXT: lslpl.w r1, r0, r3
+; V7M-NEXT: lslpl.w r1, r0, r12
; V7M-NEXT: lsl.w r0, r0, r2
-; V7M-NEXT: lsl.w r4, r1, r12
; V7M-NEXT: it pl
; V7M-NEXT: movpl r0, #0
+; V7M-NEXT: lsl.w r3, r1, r3
; V7M-NEXT: lsr.w r0, r0, r2
-; V7M-NEXT: orr.w r0, r0, r4
+; V7M-NEXT: orr.w r0, r0, r3
; V7M-NEXT: it pl
-; V7M-NEXT: lsrpl.w r0, r1, r3
+; V7M-NEXT: lsrpl.w r0, r1, r12
; V7M-NEXT: lsr.w r1, r1, r2
; V7M-NEXT: it pl
; V7M-NEXT: movpl r1, #0
diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll
index b483793..373d998 100644
--- a/llvm/test/CodeGen/ARM/extract-lowbits.ll
+++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll
@@ -243,15 +243,15 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind {
; V7A: @ %bb.0:
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r2, r0
; V7A-NEXT: and r1, r3, r1
; V7A-NEXT: pop {r11, pc}
@@ -323,15 +323,15 @@ define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind {
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
; V7A-NEXT: and r2, r2, #63
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: mov lr, #1
+; V7A-NEXT: rsb r12, r2, #32
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r2, r0
; V7A-NEXT: and r1, r3, r1
; V7A-NEXT: pop {r11, pc}
@@ -404,15 +404,15 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind {
; V7A: @ %bb.0:
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r2, r0
; V7A-NEXT: and r1, r3, r1
; V7A-NEXT: pop {r11, pc}
@@ -644,15 +644,15 @@ define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind {
; V7A: @ %bb.0:
; V7A-NEXT: .save {r11, lr}
; V7A-NEXT: push {r11, lr}
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: mov r12, #1
-; V7A-NEXT: lsr lr, r12, r3
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: mov lr, #1
; V7A-NEXT: subs r3, r2, #32
-; V7A-NEXT: lsl r2, r12, r2
+; V7A-NEXT: lsl r2, lr, r2
+; V7A-NEXT: lsr r12, lr, r12
; V7A-NEXT: movwpl r2, #0
-; V7A-NEXT: lslpl lr, r12, r3
+; V7A-NEXT: lslpl r12, lr, r3
; V7A-NEXT: subs r2, r2, #1
-; V7A-NEXT: sbc r3, lr, #0
+; V7A-NEXT: sbc r3, r12, #0
; V7A-NEXT: and r0, r0, r2
; V7A-NEXT: and r1, r1, r3
; V7A-NEXT: pop {r11, pc}
@@ -2144,23 +2144,23 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind {
;
; V7A-LABEL: bzhi64_d2_load:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
; V7A-NEXT: rsb r3, r2, #64
-; V7A-NEXT: ldm r0, {r0, r7}
-; V7A-NEXT: rsb r1, r3, #32
+; V7A-NEXT: ldm r0, {r0, r5}
+; V7A-NEXT: rsb r12, r3, #32
; V7A-NEXT: rsbs r2, r2, #32
-; V7A-NEXT: lsr r5, r0, r1
-; V7A-NEXT: orr r7, r5, r7, lsl r3
-; V7A-NEXT: lslpl r7, r0, r2
+; V7A-NEXT: lsr r1, r0, r12
+; V7A-NEXT: orr r1, r1, r5, lsl r3
+; V7A-NEXT: lslpl r1, r0, r2
; V7A-NEXT: lsl r0, r0, r3
; V7A-NEXT: movwpl r0, #0
; V7A-NEXT: lsr r0, r0, r3
-; V7A-NEXT: orr r0, r0, r7, lsl r1
-; V7A-NEXT: lsr r1, r7, r3
-; V7A-NEXT: lsrpl r0, r7, r2
+; V7A-NEXT: orr r0, r0, r1, lsl r12
+; V7A-NEXT: lsrpl r0, r1, r2
+; V7A-NEXT: lsr r1, r1, r3
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
;
; V7A-T-LABEL: bzhi64_d2_load:
; V7A-T: @ %bb.0:
@@ -2237,26 +2237,26 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind {
;
; V7A-LABEL: bzhi64_d3_load_indexzext:
; V7A: @ %bb.0:
-; V7A-NEXT: .save {r5, r7, r11, lr}
-; V7A-NEXT: push {r5, r7, r11, lr}
+; V7A-NEXT: .save {r5, lr}
+; V7A-NEXT: push {r5, lr}
; V7A-NEXT: rsb r1, r1, #64
-; V7A-NEXT: ldm r0, {r0, r7}
+; V7A-NEXT: ldm r0, {r0, r5}
; V7A-NEXT: uxtb r2, r1
-; V7A-NEXT: rsb r3, r2, #32
-; V7A-NEXT: lsr r5, r0, r3
-; V7A-NEXT: orr r7, r5, r7, lsl r2
+; V7A-NEXT: rsb r12, r2, #32
+; V7A-NEXT: lsr r3, r0, r12
+; V7A-NEXT: orr r3, r3, r5, lsl r2
; V7A-NEXT: mvn r5, #31
; V7A-NEXT: uxtab r1, r5, r1
; V7A-NEXT: cmp r1, #0
-; V7A-NEXT: lslpl r7, r0, r1
+; V7A-NEXT: lslpl r3, r0, r1
; V7A-NEXT: lsl r0, r0, r2
; V7A-NEXT: movwpl r0, #0
; V7A-NEXT: lsr r0, r0, r2
-; V7A-NEXT: orr r0, r0, r7, lsl r3
-; V7A-NEXT: lsrpl r0, r7, r1
-; V7A-NEXT: lsr r1, r7, r2
+; V7A-NEXT: orr r0, r0, r3, lsl r12
+; V7A-NEXT: lsrpl r0, r3, r1
+; V7A-NEXT: lsr r1, r3, r2
; V7A-NEXT: movwpl r1, #0
-; V7A-NEXT: pop {r5, r7, r11, pc}
+; V7A-NEXT: pop {r5, pc}
;
; V7A-T-LABEL: bzhi64_d3_load_indexzext:
; V7A-T: @ %bb.0:
diff --git a/llvm/test/CodeGen/ARM/llround-conv.ll b/llvm/test/CodeGen/ARM/llround-conv.ll
index 0f57e4a..f734db8 100644
--- a/llvm/test/CodeGen/ARM/llround-conv.ll
+++ b/llvm/test/CodeGen/ARM/llround-conv.ll
@@ -1,25 +1,71 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+
+define i64 @testmsxh_builtin(half %x) {
+; CHECK-SOFT-LABEL: testmsxh_builtin:
+; CHECK-SOFT: @ %bb.0: @ %entry
+; CHECK-SOFT-NEXT: .save {r11, lr}
+; CHECK-SOFT-NEXT: push {r11, lr}
+; CHECK-SOFT-NEXT: bl __aeabi_h2f
+; CHECK-SOFT-NEXT: bl llroundf
+; CHECK-SOFT-NEXT: pop {r11, pc}
+;
+; CHECK-NOFP16-LABEL: testmsxh_builtin:
+; CHECK-NOFP16: @ %bb.0: @ %entry
+; CHECK-NOFP16-NEXT: .save {r11, lr}
+; CHECK-NOFP16-NEXT: push {r11, lr}
+; CHECK-NOFP16-NEXT: vmov r0, s0
+; CHECK-NOFP16-NEXT: bl __aeabi_h2f
+; CHECK-NOFP16-NEXT: vmov s0, r0
+; CHECK-NOFP16-NEXT: bl llroundf
+; CHECK-NOFP16-NEXT: pop {r11, pc}
+;
+; CHECK-FP16-LABEL: testmsxh_builtin:
+; CHECK-FP16: @ %bb.0: @ %entry
+; CHECK-FP16-NEXT: .save {r11, lr}
+; CHECK-FP16-NEXT: push {r11, lr}
+; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0
+; CHECK-FP16-NEXT: bl llroundf
+; CHECK-FP16-NEXT: pop {r11, pc}
+entry:
+ %0 = tail call i64 @llvm.llround.i64.f16(half %x)
+ ret i64 %0
+}
-; SOFTFP-LABEL: testmsxs_builtin:
-; SOFTFP: bl llroundf
-; HARDFP-LABEL: testmsxs_builtin:
-; HARDFP: bl llroundf
define i64 @testmsxs_builtin(float %x) {
+; CHECK-LABEL: testmsxs_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llroundf
+; CHECK-NEXT: pop {r11, pc}
entry:
- %0 = tail call i64 @llvm.llround.f32(float %x)
+ %0 = tail call i64 @llvm.llround.i64.f32(float %x)
ret i64 %0
}
-; SOFTFP-LABEL: testmsxd_builtin:
-; SOFTFP: bl llround
-; HARDFP-LABEL: testmsxd_builtin:
-; HARDFP: bl llround
define i64 @testmsxd_builtin(double %x) {
+; CHECK-LABEL: testmsxd_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llround
+; CHECK-NEXT: pop {r11, pc}
entry:
- %0 = tail call i64 @llvm.llround.f64(double %x)
+ %0 = tail call i64 @llvm.llround.i64.f64(double %x)
ret i64 %0
}
-declare i64 @llvm.llround.f32(float) nounwind readnone
-declare i64 @llvm.llround.f64(double) nounwind readnone
+define i64 @testmsxq_builtin(fp128 %x) {
+; CHECK-LABEL: testmsxq_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl llroundl
+; CHECK-NEXT: pop {r11, pc}
+entry:
+ %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x)
+ ret i64 %0
+}
diff --git a/llvm/test/CodeGen/ARM/lround-conv.ll b/llvm/test/CodeGen/ARM/lround-conv.ll
index 3aaed74..03f7a0d 100644
--- a/llvm/test/CodeGen/ARM/lround-conv.ll
+++ b/llvm/test/CodeGen/ARM/lround-conv.ll
@@ -1,25 +1,47 @@
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP
-; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT
+; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16
+; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FPv8
+; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16
+
+;define i32 @testmswh_builtin(half %x) {
+;entry:
+; %0 = tail call i32 @llvm.lround.i32.f16(half %x)
+; ret i32 %0
+;}
-; SOFTFP-LABEL: testmsws_builtin:
-; SOFTFP: bl lroundf
-; HARDFP-LABEL: testmsws_builtin:
-; HARDFP: bl lroundf
define i32 @testmsws_builtin(float %x) {
+; CHECK-LABEL: testmsws_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b lroundf
entry:
%0 = tail call i32 @llvm.lround.i32.f32(float %x)
ret i32 %0
}
-; SOFTFP-LABEL: testmswd_builtin:
-; SOFTFP: bl lround
-; HARDFP-LABEL: testmswd_builtin:
-; HARDFP: bl lround
define i32 @testmswd_builtin(double %x) {
+; CHECK-LABEL: testmswd_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: b lround
entry:
%0 = tail call i32 @llvm.lround.i32.f64(double %x)
ret i32 %0
}
-declare i32 @llvm.lround.i32.f32(float) nounwind readnone
-declare i32 @llvm.lround.i32.f64(double) nounwind readnone
+define i32 @testmswq_builtin(fp128 %x) {
+; CHECK-LABEL: testmswq_builtin:
+; CHECK: @ %bb.0: @ %entry
+; CHECK-NEXT: .save {r11, lr}
+; CHECK-NEXT: push {r11, lr}
+; CHECK-NEXT: bl lroundl
+; CHECK-NEXT: pop {r11, pc}
+entry:
+ %0 = tail call i32 @llvm.lround.i32.f128(fp128 %x)
+ ret i32 %0
+}
+
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; CHECK-FP16: {{.*}}
+; CHECK-FPv8: {{.*}}
+; CHECK-NOFP16: {{.*}}
+; CHECK-SOFT: {{.*}}
diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll
index e761d3a..33b89a4 100644
--- a/llvm/test/CodeGen/RISCV/pr69586.ll
+++ b/llvm/test/CodeGen/RISCV/pr69586.ll
@@ -39,119 +39,118 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a2, a2, 1
; NOREMAT-NEXT: sub sp, sp, a2
; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb
-; NOREMAT-NEXT: mv a7, a0
-; NOREMAT-NEXT: li a0, 32
-; NOREMAT-NEXT: addi a5, a7, 512
-; NOREMAT-NEXT: addi a4, a7, 1024
-; NOREMAT-NEXT: addi a6, a7, 1536
-; NOREMAT-NEXT: li t4, 1
-; NOREMAT-NEXT: li a2, 5
+; NOREMAT-NEXT: li a7, 32
+; NOREMAT-NEXT: addi s10, a0, 512
+; NOREMAT-NEXT: addi a4, a0, 1024
+; NOREMAT-NEXT: addi a6, a0, 1536
+; NOREMAT-NEXT: li t0, 1
+; NOREMAT-NEXT: li a3, 5
; NOREMAT-NEXT: li t1, 3
-; NOREMAT-NEXT: li t0, 7
-; NOREMAT-NEXT: lui t5, 1
+; NOREMAT-NEXT: li a2, 7
+; NOREMAT-NEXT: lui t2, 1
; NOREMAT-NEXT: li s4, 9
; NOREMAT-NEXT: li s6, 11
; NOREMAT-NEXT: li s9, 13
; NOREMAT-NEXT: li ra, 15
-; NOREMAT-NEXT: lui t2, 2
+; NOREMAT-NEXT: lui a5, 2
; NOREMAT-NEXT: lui s1, 3
; NOREMAT-NEXT: lui t3, 4
; NOREMAT-NEXT: lui s0, 5
; NOREMAT-NEXT: lui s3, 6
; NOREMAT-NEXT: lui s7, 7
-; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma
-; NOREMAT-NEXT: slli t4, t4, 11
-; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: slli a3, a2, 9
-; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma
+; NOREMAT-NEXT: slli t0, t0, 11
+; NOREMAT-NEXT: sd t0, 512(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: slli t4, a3, 9
+; NOREMAT-NEXT: sd t4, 504(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: slli t6, t1, 10
-; NOREMAT-NEXT: slli s2, t0, 9
-; NOREMAT-NEXT: add a0, a7, t5
+; NOREMAT-NEXT: slli s2, a2, 9
+; NOREMAT-NEXT: add a7, a0, t2
; NOREMAT-NEXT: lui s11, 1
; NOREMAT-NEXT: slli s4, s4, 9
-; NOREMAT-NEXT: slli s5, a2, 10
+; NOREMAT-NEXT: slli s5, a3, 10
; NOREMAT-NEXT: slli s6, s6, 9
; NOREMAT-NEXT: slli s8, t1, 11
-; NOREMAT-NEXT: vle32.v v8, (a5)
+; NOREMAT-NEXT: vle32.v v8, (s10)
; NOREMAT-NEXT: slli s9, s9, 9
; NOREMAT-NEXT: li t5, 13
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: slli s10, t0, 10
+; NOREMAT-NEXT: slli s10, a2, 10
; NOREMAT-NEXT: vle32.v v0, (a6)
; NOREMAT-NEXT: vle32.v v12, (a6)
; NOREMAT-NEXT: slli ra, ra, 9
-; NOREMAT-NEXT: vle32.v v4, (a0)
-; NOREMAT-NEXT: vle32.v v20, (a0)
-; NOREMAT-NEXT: add a4, a7, t2
+; NOREMAT-NEXT: vle32.v v4, (a7)
+; NOREMAT-NEXT: vle32.v v20, (a7)
+; NOREMAT-NEXT: add a4, a0, a5
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: add a4, a7, s1
+; NOREMAT-NEXT: add a4, a0, s1
; NOREMAT-NEXT: vle32.v v28, (a4)
; NOREMAT-NEXT: vle32.v v26, (a4)
-; NOREMAT-NEXT: add a4, a7, t3
+; NOREMAT-NEXT: add a4, a0, t3
; NOREMAT-NEXT: vle32.v v24, (a4)
; NOREMAT-NEXT: vle32.v v22, (a4)
-; NOREMAT-NEXT: add a4, a7, s0
-; NOREMAT-NEXT: vle32.v v14, (a7)
+; NOREMAT-NEXT: add a4, a0, s0
+; NOREMAT-NEXT: vle32.v v14, (a0)
; NOREMAT-NEXT: vle32.v v18, (a4)
; NOREMAT-NEXT: vle32.v v16, (a4)
-; NOREMAT-NEXT: add a4, a7, s3
+; NOREMAT-NEXT: add a4, a0, s3
; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8
; NOREMAT-NEXT: vle32.v v14, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: addi a0, sp, 640
-; NOREMAT-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, t4
+; NOREMAT-NEXT: addi a4, sp, 640
+; NOREMAT-NEXT: vs2r.v v8, (a4) # vscale x 16-byte Folded Spill
+; NOREMAT-NEXT: add a4, a0, t0
; NOREMAT-NEXT: vle32.v v10, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, a3
+; NOREMAT-NEXT: add a4, a0, t4
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, t6
+; NOREMAT-NEXT: add a4, a0, t6
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0
; NOREMAT-NEXT: vle32.v v2, (a4)
-; NOREMAT-NEXT: add a4, a7, s2
+; NOREMAT-NEXT: add a4, a0, s2
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s7
+; NOREMAT-NEXT: add a4, a0, s7
; NOREMAT-NEXT: vle32.v v0, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8
; NOREMAT-NEXT: vle32.v v10, (a4)
-; NOREMAT-NEXT: add a4, a7, s4
+; NOREMAT-NEXT: add a4, a0, s4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s5
+; NOREMAT-NEXT: add a4, a0, s5
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, s6
+; NOREMAT-NEXT: add a4, a0, s6
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s8
+; NOREMAT-NEXT: add a4, a0, s8
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, s9
+; NOREMAT-NEXT: add a4, a0, s9
; NOREMAT-NEXT: vle32.v v20, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: vle32.v v12, (a4)
-; NOREMAT-NEXT: add a4, a7, s10
+; NOREMAT-NEXT: add a4, a0, s10
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20
; NOREMAT-NEXT: vle32.v v8, (a4)
-; NOREMAT-NEXT: add a4, a7, ra
+; NOREMAT-NEXT: add a4, a0, ra
; NOREMAT-NEXT: vle32.v v2, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4
; NOREMAT-NEXT: lui t4, 8
-; NOREMAT-NEXT: add a5, a7, t4
+; NOREMAT-NEXT: add a5, a0, t4
; NOREMAT-NEXT: vle32.v v20, (a5)
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2
@@ -159,14 +158,14 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a4, a4, 9
; NOREMAT-NEXT: li s1, 17
; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v4, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6
; NOREMAT-NEXT: li a5, 9
; NOREMAT-NEXT: slli a4, a5, 10
; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v12, (a4)
; NOREMAT-NEXT: vle32.v v6, (a4)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
@@ -174,256 +173,257 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: slli a4, a4, 9
; NOREMAT-NEXT: li t2, 19
; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a4, a7, a4
+; NOREMAT-NEXT: add a4, a0, a4
; NOREMAT-NEXT: vle32.v v8, (a4)
; NOREMAT-NEXT: vle32.v v30, (a4)
-; NOREMAT-NEXT: slli a3, a2, 11
+; NOREMAT-NEXT: slli a3, a3, 11
; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li s7, 21
; NOREMAT-NEXT: slli a3, s7, 9
; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
; NOREMAT-NEXT: li a6, 11
; NOREMAT-NEXT: slli a3, a6, 10
; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
; NOREMAT-NEXT: li s3, 23
; NOREMAT-NEXT: slli a3, s3, 9
; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
; NOREMAT-NEXT: li s0, 25
; NOREMAT-NEXT: slli a3, s0, 9
; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v12, (a3)
; NOREMAT-NEXT: vle32.v v6, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: slli a3, t5, 10
; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v8, (a3)
; NOREMAT-NEXT: vle32.v v30, (a3)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28
; NOREMAT-NEXT: li t3, 27
; NOREMAT-NEXT: slli a3, t3, 9
; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a3, a7, a3
+; NOREMAT-NEXT: add a3, a0, a3
; NOREMAT-NEXT: vle32.v v28, (a3)
; NOREMAT-NEXT: vle32.v v4, (a3)
-; NOREMAT-NEXT: slli a2, t0, 11
+; NOREMAT-NEXT: slli a2, a2, 11
; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
; NOREMAT-NEXT: li t0, 29
; NOREMAT-NEXT: slli a2, t0, 9
; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28
-; NOREMAT-NEXT: li a3, 15
-; NOREMAT-NEXT: slli a2, a3, 10
+; NOREMAT-NEXT: li a7, 15
+; NOREMAT-NEXT: slli a2, a7, 10
; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12
; NOREMAT-NEXT: li t1, 31
; NOREMAT-NEXT: slli a2, t1, 9
; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8
-; NOREMAT-NEXT: lui a4, 4
-; NOREMAT-NEXT: addi a0, a4, 512
-; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a7, a0
-; NOREMAT-NEXT: vle32.v v8, (a0)
-; NOREMAT-NEXT: vle32.v v26, (a0)
+; NOREMAT-NEXT: lui a3, 4
+; NOREMAT-NEXT: addi a2, a3, 512
+; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a2, a0, a2
+; NOREMAT-NEXT: vle32.v v8, (a2)
+; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28
; NOREMAT-NEXT: slli a2, s1, 10
; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12
-; NOREMAT-NEXT: addi a2, a4, 1536
+; NOREMAT-NEXT: addi a2, a3, 1536
; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: lui a4, 4
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, a5, 11
; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8
; NOREMAT-NEXT: lui a5, 5
; NOREMAT-NEXT: addi a2, a5, -1536
; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28
; NOREMAT-NEXT: slli a2, t2, 10
; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: li t2, 19
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: li a3, 19
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12
; NOREMAT-NEXT: addi a2, a5, -512
; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24
; NOREMAT-NEXT: addi a2, a5, 512
; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: slli a2, s7, 10
; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26
; NOREMAT-NEXT: addi a2, a5, 1536
; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: slli a2, a6, 11
; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18
; NOREMAT-NEXT: lui a6, 6
; NOREMAT-NEXT: addi a2, a6, -1536
; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: slli a2, s3, 10
; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8
; NOREMAT-NEXT: addi a2, a6, -512
; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22
; NOREMAT-NEXT: addi a2, a6, 512
; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, s0, 10
; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18
; NOREMAT-NEXT: addi a2, a6, 1536
; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
; NOREMAT-NEXT: slli a2, t5, 11
; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8
; NOREMAT-NEXT: lui s0, 7
; NOREMAT-NEXT: addi a2, s0, -1536
; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, t3, 10
; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addi a0, sp, 640
-; NOREMAT-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload
+; NOREMAT-NEXT: addi a2, sp, 640
+; NOREMAT-NEXT: vl2r.v v12, (a2) # vscale x 16-byte Folded Reload
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22
; NOREMAT-NEXT: addi a2, s0, -512
; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v12, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26
; NOREMAT-NEXT: addi a2, s0, 512
; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: lui t3, 7
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v26, (a2)
; NOREMAT-NEXT: vle32.v v4, (a2)
; NOREMAT-NEXT: slli a2, t0, 10
; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v18, (a2)
; NOREMAT-NEXT: vle32.v v2, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16
; NOREMAT-NEXT: addi a2, t3, 1536
; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v16, (a2)
; NOREMAT-NEXT: vle32.v v28, (a2)
-; NOREMAT-NEXT: slli a2, a3, 11
+; NOREMAT-NEXT: slli a2, a7, 11
; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v8, (a2)
; NOREMAT-NEXT: vle32.v v6, (a2)
; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14
; NOREMAT-NEXT: addi a2, t4, -1536
; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v14, (a2)
; NOREMAT-NEXT: vle32.v v24, (a2)
; NOREMAT-NEXT: slli a2, t1, 10
; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill
; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22
-; NOREMAT-NEXT: add a2, a7, a2
+; NOREMAT-NEXT: add a2, a0, a2
; NOREMAT-NEXT: vle32.v v22, (a2)
; NOREMAT-NEXT: vle32.v v30, (a2)
-; NOREMAT-NEXT: addi a0, t4, -512
-; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill
-; NOREMAT-NEXT: add a0, a7, a0
+; NOREMAT-NEXT: addi a2, t4, -512
+; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill
+; NOREMAT-NEXT: add a0, a0, a2
; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0
; NOREMAT-NEXT: vle32.v v12, (a0)
; NOREMAT-NEXT: vle32.v v0, (a0)
@@ -476,7 +476,7 @@ define void @test(ptr %0, ptr %1, i64 %2) {
; NOREMAT-NEXT: addi s11, a0, 512
; NOREMAT-NEXT: addi s7, a0, 1024
; NOREMAT-NEXT: addi s3, a0, 1536
-; NOREMAT-NEXT: slli s1, t2, 11
+; NOREMAT-NEXT: slli s1, a3, 11
; NOREMAT-NEXT: lui a0, 10
; NOREMAT-NEXT: addi t2, a0, -1536
; NOREMAT-NEXT: addi a7, a0, -1024
diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
index 34d4657..c68fa59 100644
--- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
+++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
@@ -78,3 +78,10 @@ define i64 @test0(i64 %n, ptr %p) nounwind {
ret i64 %ret
}
+
+; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the
+; presence of trampolines.
+; UTC_ARGS: --disable
+; RV64-LINUX: .section ".note.GNU-stack","x",@progbits
+; RV64: .section ".note.GNU-stack","x",@progbits
+; UTC_ARGS: --enable
diff --git a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
index 9798077..e3ed31f 100644
--- a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
+++ b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll
@@ -163,9 +163,9 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 {
; LINUX-NEXT: ld %f10, 8(%r3)
; LINUX-NEXT: ld %f0, 16(%r3)
; LINUX-NEXT: ld %f2, 24(%r3)
-; LINUX-NEXT: la %r3, 16(%r2)
-; LINUX-NEXT: la %r4, 48(%r2)
; LINUX-NEXT: la %r2, 176(%r15)
+; LINUX-NEXT: la %r3, 16(%r13)
+; LINUX-NEXT: la %r4, 48(%r13)
; LINUX-NEXT: std %f0, 176(%r15)
; LINUX-NEXT: std %f2, 184(%r15)
; LINUX-NEXT: brasl %r14, sincosl@PLT
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
index 6f986ce..c418038 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll
@@ -541,11 +541,11 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocaptur
; CHECK-NEXT: cbz r2, .LBB7_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
; CHECK-NEXT: adds r3, r2, #7
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: bic r3, r3, #7
; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: bic r3, r3, #7
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: subs r3, #8
-; CHECK-NEXT: vmov q3, q1
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: add.w r12, r4, r3, lsr #3
; CHECK-NEXT: mov r3, r0
; CHECK-NEXT: mov r4, r1
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
index 4020709..fe06601 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll
@@ -16,39 +16,40 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
; CHECK-NEXT: sub sp, #64
; CHECK-NEXT: ldrsh.w r7, [r2]
; CHECK-NEXT: cmp r7, #1
-; CHECK-NEXT: blt.w .LBB0_6
+; CHECK-NEXT: blt .LBB0_6
; CHECK-NEXT: @ %bb.2: @ %for.cond3.preheader.us.preheader
-; CHECK-NEXT: movs r2, #252
; CHECK-NEXT: ldr r4, [sp, #152]
+; CHECK-NEXT: movs r2, #252
; CHECK-NEXT: and.w r6, r2, r3, lsr #3
; CHECK-NEXT: movs r2, #120
; CHECK-NEXT: and.w r5, r2, r3, lsr #9
; CHECK-NEXT: lsls r3, r3, #3
-; CHECK-NEXT: uxtb r3, r3
; CHECK-NEXT: muls r6, r4, r6
+; CHECK-NEXT: uxtb r3, r3
; CHECK-NEXT: rsb.w r2, r4, #256
-; CHECK-NEXT: vmov.i16 q2, #0xfc
+; CHECK-NEXT: vmov.i16 q1, #0xfc
+; CHECK-NEXT: vdup.16 q0, r6
; CHECK-NEXT: mul lr, r5, r4
-; CHECK-NEXT: vdup.16 q4, r6
; CHECK-NEXT: mov.w r6, #2016
-; CHECK-NEXT: vmov.i16 q6, #0xf8
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
; CHECK-NEXT: mul r5, r3, r4
; CHECK-NEXT: adds r3, r7, #7
+; CHECK-NEXT: vdup.16 q0, r6
; CHECK-NEXT: bic r3, r3, #7
-; CHECK-NEXT: vdup.16 q3, lr
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vdup.16 q0, r5
+; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vdup.16 q0, lr
; CHECK-NEXT: subs r3, #8
+; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: movs r4, #1
-; CHECK-NEXT: vdup.16 q0, r5
-; CHECK-NEXT: lsls r1, r1, #1
+; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: add.w r3, r4, r3, lsr #3
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q0, #0xf800
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload
+; CHECK-NEXT: lsls r1, r1, #1
; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: vdup.16 q5, r6
-; CHECK-NEXT: vmov.i16 q7, #0x78
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill
+; CHECK-NEXT: vmov.i16 q4, #0xf8
; CHECK-NEXT: .LBB0_3: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB0_4 Depth 2
@@ -59,37 +60,31 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture
; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.16 r6
-; CHECK-NEXT: subs r6, #8
+; CHECK-NEXT: vmov.i16 q5, #0xf800
; CHECK-NEXT: vpst
; CHECK-NEXT: vldrht.u16 q0, [r5]
-; CHECK-NEXT: vshr.u16 q1, q0, #3
-; CHECK-NEXT: vand q1, q1, q2
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmla.i16 q2, q1, r2
-; CHECK-NEXT: vshr.u16 q1, q2, #5
-; CHECK-NEXT: vshl.i16 q2, q0, #3
-; CHECK-NEXT: vand q3, q1, q5
-; CHECK-NEXT: vmov q1, q7
-; CHECK-NEXT: vand q2, q2, q6
-; CHECK-NEXT: vmov q7, q6
-; CHECK-NEXT: vmov q6, q5
-; CHECK-NEXT: vmov q5, q4
-; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: subs r6, #8
+; CHECK-NEXT: vshr.u16 q3, q0, #3
+; CHECK-NEXT: vand q3, q3, q1
+; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmla.i16 q1, q3, r2
+; CHECK-NEXT: vshl.i16 q3, q0, #3
+; CHECK-NEXT: vand q3, q3, q4
+; CHECK-NEXT: vmov q4, q6
+; CHECK-NEXT: vshr.u16 q1, q1, #5
+; CHECK-NEXT: vmla.i16 q4, q3, r2
+; CHECK-NEXT: vshr.u16 q3, q4, #11
+; CHECK-NEXT: vand q1, q1, q7
+; CHECK-NEXT: vorr q1, q1, q3
; CHECK-NEXT: vshr.u16 q0, q0, #9
-; CHECK-NEXT: vmla.i16 q4, q2, r2
-; CHECK-NEXT: vshr.u16 q2, q4, #11
-; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vmov q6, q7
-; CHECK-NEXT: vmov q7, q1
-; CHECK-NEXT: vorr q1, q3, q2
-; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vand q0, q0, q7
-; CHECK-NEXT: vmla.i16 q2, q0, r2
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
-; CHECK-NEXT: vand q0, q2, q0
-; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vmov.i16 q3, #0x78
+; CHECK-NEXT: vmov.i16 q4, #0xf8
+; CHECK-NEXT: vand q0, q0, q3
+; CHECK-NEXT: vmov q3, q2
+; CHECK-NEXT: vmla.i16 q3, q0, r2
+; CHECK-NEXT: vand q0, q3, q5
; CHECK-NEXT: vorr q0, q1, q0
+; CHECK-NEXT: vmov.i16 q1, #0xfc
; CHECK-NEXT: vpst
; CHECK-NEXT: vstrht.16 q0, [r5], #16
; CHECK-NEXT: le lr, .LBB0_4
@@ -190,7 +185,7 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: ldrsh.w r12, [r2, #2]
; CHECK-NEXT: cmp.w r12, #1
-; CHECK-NEXT: blt.w .LBB1_7
+; CHECK-NEXT: blt .LBB1_7
; CHECK-NEXT: @ %bb.1: @ %for.cond3.preheader.lr.ph
; CHECK-NEXT: ldrsh.w r2, [r2]
; CHECK-NEXT: cmp r2, #1
@@ -200,71 +195,70 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
; CHECK-NEXT: push {r4, r5, r6, r7, lr}
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: sub sp, #80
-; CHECK-NEXT: ldr r7, [sp, #168]
+; CHECK-NEXT: ldr r7, [sp, #88]
; CHECK-NEXT: movs r5, #120
; CHECK-NEXT: lsls r6, r3, #3
; CHECK-NEXT: movs r4, #252
; CHECK-NEXT: and.w r5, r5, r3, lsr #9
; CHECK-NEXT: uxtb r6, r6
; CHECK-NEXT: and.w r3, r4, r3, lsr #3
+; CHECK-NEXT: adds r4, r2, #7
; CHECK-NEXT: muls r6, r7, r6
+; CHECK-NEXT: bic r4, r4, #7
; CHECK-NEXT: mul lr, r3, r7
-; CHECK-NEXT: vdup.16 q0, r6
-; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
-; CHECK-NEXT: vdup.16 q0, lr
; CHECK-NEXT: muls r5, r7, r5
-; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q0, #0xfc
-; CHECK-NEXT: mov.w r6, #2016
-; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
-; CHECK-NEXT: vdup.16 q0, r5
; CHECK-NEXT: rsb.w r3, r7, #256
; CHECK-NEXT: lsls r7, r1, #1
-; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vdup.16 q0, r6
+; CHECK-NEXT: sub.w r1, r4, #8
+; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: vmov.i16 q2, #0xf8
-; CHECK-NEXT: vmov.i16 q5, #0x78
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q6, #0xf800
+; CHECK-NEXT: add.w r1, r4, r1, lsr #3
+; CHECK-NEXT: vdup.16 q6, r6
+; CHECK-NEXT: mov.w r6, #2016
; CHECK-NEXT: movs r4, #0
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vdup.16 q3, lr
+; CHECK-NEXT: vdup.16 q5, r5
+; CHECK-NEXT: vdup.16 q7, r6
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB1_3: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_4 Depth 2
; CHECK-NEXT: mov r5, r0
-; CHECK-NEXT: dlstp.16 lr, r2
+; CHECK-NEXT: mov r6, r2
+; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: .p2align 2
; CHECK-NEXT: .LBB1_4: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vldrh.u16 q0, [r5]
+; CHECK-NEXT: vctp.16 r6
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vldrht.u16 q0, [r5]
; CHECK-NEXT: vshl.i16 q1, q0, #3
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: subs r6, #8
; CHECK-NEXT: vand q1, q1, q2
-; CHECK-NEXT: vmla.i16 q3, q1, r3
-; CHECK-NEXT: vmov.f64 d8, d4
-; CHECK-NEXT: vmov.f64 d9, d5
-; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vshr.u16 q2, q0, #9
+; CHECK-NEXT: vmov.i16 q2, #0x78
+; CHECK-NEXT: vshr.u16 q4, q0, #9
+; CHECK-NEXT: vand q4, q4, q2
+; CHECK-NEXT: vmov q2, q6
+; CHECK-NEXT: vmla.i16 q2, q1, r3
; CHECK-NEXT: vshr.u16 q0, q0, #3
+; CHECK-NEXT: vmov.i16 q1, #0xfc
; CHECK-NEXT: vand q0, q0, q1
-; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmla.i16 q1, q0, r3
-; CHECK-NEXT: vand q2, q2, q5
-; CHECK-NEXT: vshr.u16 q0, q3, #11
-; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vshr.u16 q0, q2, #11
+; CHECK-NEXT: vmov q2, q5
+; CHECK-NEXT: vmla.i16 q2, q4, r3
; CHECK-NEXT: vshr.u16 q1, q1, #5
-; CHECK-NEXT: vmla.i16 q3, q2, r3
+; CHECK-NEXT: vmov.i16 q4, #0xf800
; CHECK-NEXT: vand q1, q1, q7
; CHECK-NEXT: vorr q0, q1, q0
-; CHECK-NEXT: vand q1, q3, q6
+; CHECK-NEXT: vand q1, q2, q4
+; CHECK-NEXT: vmov.i16 q2, #0xf8
; CHECK-NEXT: vorr q0, q0, q1
-; CHECK-NEXT: vstrh.16 q0, [r5], #16
-; CHECK-NEXT: vmov.f64 d4, d8
-; CHECK-NEXT: vmov.f64 d5, d9
-; CHECK-NEXT: letp lr, .LBB1_4
+; CHECK-NEXT: vpst
+; CHECK-NEXT: vstrht.16 q0, [r5], #16
+; CHECK-NEXT: le lr, .LBB1_4
; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us
; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1
; CHECK-NEXT: adds r4, #1
@@ -272,7 +266,6 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc
; CHECK-NEXT: cmp r4, r12
; CHECK-NEXT: bne .LBB1_3
; CHECK-NEXT: @ %bb.6:
-; CHECK-NEXT: add sp, #80
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr}
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
index 07c06e1..1769c5d 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll
@@ -17,17 +17,16 @@
define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr {
; ENABLED-LABEL: varying_outer_2d_reduction:
; ENABLED: @ %bb.0: @ %entry
-; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; ENABLED-NEXT: sub sp, #4
; ENABLED-NEXT: cmp r3, #1
-; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill
-; ENABLED-NEXT: blt .LBB0_8
-; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph
-; ENABLED-NEXT: ldr r0, [sp, #36]
-; ENABLED-NEXT: add.w r12, r2, #3
-; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload
-; ENABLED-NEXT: mov.w r8, #0
-; ENABLED-NEXT: mov r9, r12
+; ENABLED-NEXT: it lt
+; ENABLED-NEXT: bxlt lr
+; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph
+; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: mov r11, r0
+; ENABLED-NEXT: ldr r0, [sp, #32]
+; ENABLED-NEXT: add.w r9, r2, #3
+; ENABLED-NEXT: mov.w r12, #0
+; ENABLED-NEXT: mov r10, r11
; ENABLED-NEXT: uxth r0, r0
; ENABLED-NEXT: rsbs r5, r0, #0
; ENABLED-NEXT: b .LBB0_4
@@ -37,31 +36,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: lsrs r0, r0, #16
; ENABLED-NEXT: sub.w r9, r9, #1
-; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1]
-; ENABLED-NEXT: add.w r8, r8, #1
+; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1]
+; ENABLED-NEXT: add.w r12, r12, #1
; ENABLED-NEXT: add.w r10, r10, #2
-; ENABLED-NEXT: cmp r8, r3
+; ENABLED-NEXT: cmp r12, r3
; ENABLED-NEXT: beq .LBB0_8
; ENABLED-NEXT: .LBB0_4: @ %for.body
; ENABLED-NEXT: @ =>This Loop Header: Depth=1
; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2
-; ENABLED-NEXT: cmp r2, r8
+; ENABLED-NEXT: cmp r2, r12
; ENABLED-NEXT: ble .LBB0_2
; ENABLED-NEXT: @ %bb.5: @ %vector.ph
; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1
; ENABLED-NEXT: bic r0, r9, #3
; ENABLED-NEXT: movs r7, #1
; ENABLED-NEXT: subs r0, #4
-; ENABLED-NEXT: sub.w r4, r2, r8
+; ENABLED-NEXT: sub.w r4, r2, r12
; ENABLED-NEXT: vmov.i32 q1, #0x0
; ENABLED-NEXT: add.w r6, r7, r0, lsr #2
-; ENABLED-NEXT: sub.w r0, r12, r8
+; ENABLED-NEXT: adds r0, r2, #3
+; ENABLED-NEXT: sub.w r0, r0, r12
; ENABLED-NEXT: bic r0, r0, #3
; ENABLED-NEXT: subs r0, #4
; ENABLED-NEXT: add.w r0, r7, r0, lsr #2
; ENABLED-NEXT: mov r7, r10
; ENABLED-NEXT: dls lr, r0
-; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload
+; ENABLED-NEXT: mov r0, r11
; ENABLED-NEXT: .LBB0_6: @ %vector.body
; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1
; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2
@@ -82,23 +82,22 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; ENABLED-NEXT: vpsel q0, q1, q0
; ENABLED-NEXT: vaddv.u32 r0, q0
; ENABLED-NEXT: b .LBB0_3
-; ENABLED-NEXT: .LBB0_8: @ %for.end17
-; ENABLED-NEXT: add sp, #4
-; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; ENABLED-NEXT: .LBB0_8:
+; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; ENABLED-NEXT: bx lr
;
; NOREDUCTIONS-LABEL: varying_outer_2d_reduction:
; NOREDUCTIONS: @ %bb.0: @ %entry
-; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
-; NOREDUCTIONS-NEXT: sub sp, #4
; NOREDUCTIONS-NEXT: cmp r3, #1
-; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill
-; NOREDUCTIONS-NEXT: blt .LBB0_8
-; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph
-; NOREDUCTIONS-NEXT: ldr r0, [sp, #36]
-; NOREDUCTIONS-NEXT: add.w r12, r2, #3
-; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload
-; NOREDUCTIONS-NEXT: mov.w r8, #0
-; NOREDUCTIONS-NEXT: mov r9, r12
+; NOREDUCTIONS-NEXT: it lt
+; NOREDUCTIONS-NEXT: bxlt lr
+; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph
+; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: mov r11, r0
+; NOREDUCTIONS-NEXT: ldr r0, [sp, #32]
+; NOREDUCTIONS-NEXT: add.w r9, r2, #3
+; NOREDUCTIONS-NEXT: mov.w r12, #0
+; NOREDUCTIONS-NEXT: mov r10, r11
; NOREDUCTIONS-NEXT: uxth r0, r0
; NOREDUCTIONS-NEXT: rsbs r5, r0, #0
; NOREDUCTIONS-NEXT: b .LBB0_4
@@ -108,31 +107,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: lsrs r0, r0, #16
; NOREDUCTIONS-NEXT: sub.w r9, r9, #1
-; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1]
-; NOREDUCTIONS-NEXT: add.w r8, r8, #1
+; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1]
+; NOREDUCTIONS-NEXT: add.w r12, r12, #1
; NOREDUCTIONS-NEXT: add.w r10, r10, #2
-; NOREDUCTIONS-NEXT: cmp r8, r3
+; NOREDUCTIONS-NEXT: cmp r12, r3
; NOREDUCTIONS-NEXT: beq .LBB0_8
; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body
; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1
; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2
-; NOREDUCTIONS-NEXT: cmp r2, r8
+; NOREDUCTIONS-NEXT: cmp r2, r12
; NOREDUCTIONS-NEXT: ble .LBB0_2
; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph
; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1
; NOREDUCTIONS-NEXT: bic r0, r9, #3
; NOREDUCTIONS-NEXT: movs r7, #1
; NOREDUCTIONS-NEXT: subs r0, #4
-; NOREDUCTIONS-NEXT: sub.w r4, r2, r8
+; NOREDUCTIONS-NEXT: sub.w r4, r2, r12
; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0
; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2
-; NOREDUCTIONS-NEXT: sub.w r0, r12, r8
+; NOREDUCTIONS-NEXT: adds r0, r2, #3
+; NOREDUCTIONS-NEXT: sub.w r0, r0, r12
; NOREDUCTIONS-NEXT: bic r0, r0, #3
; NOREDUCTIONS-NEXT: subs r0, #4
; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2
; NOREDUCTIONS-NEXT: mov r7, r10
; NOREDUCTIONS-NEXT: dls lr, r0
-; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload
+; NOREDUCTIONS-NEXT: mov r0, r11
; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body
; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1
; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2
@@ -153,9 +153,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input,
; NOREDUCTIONS-NEXT: vpsel q0, q1, q0
; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0
; NOREDUCTIONS-NEXT: b .LBB0_3
-; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17
-; NOREDUCTIONS-NEXT: add sp, #4
-; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc}
+; NOREDUCTIONS-NEXT: .LBB0_8:
+; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr}
+; NOREDUCTIONS-NEXT: bx lr
entry:
%conv = sext i16 %N to i32
%cmp36 = icmp sgt i16 %N, 0
diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
index e0a61b1..78dc35b 100644
--- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
+++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll
@@ -49,18 +49,17 @@ define i32 @vcmp_new_vpst_combination(i32 %len, ptr nocapture readonly %arr) {
; CHECK-NEXT: cmp r0, #1
; CHECK-NEXT: blt .LBB1_4
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: vmov.i32 q1, #0x1
+; CHECK-NEXT: vmov.i32 q0, #0x1
; CHECK-NEXT: movs r2, #0
; CHECK-NEXT: dlstp.32 lr, r0
; CHECK-NEXT: .LBB1_2: @ %vector.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q2, [r1], #16
-; CHECK-NEXT: vcmp.i32 ne, q2, zr
-; CHECK-NEXT: vmov q2, q0
+; CHECK-NEXT: vldrw.u32 q1, [r1], #16
+; CHECK-NEXT: vcmp.i32 ne, q1, zr
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: vpst
-; CHECK-NEXT: vmovt q2, q1
-; CHECK-NEXT: vaddva.u32 r2, q2
+; CHECK-NEXT: vmovt q1, q0
+; CHECK-NEXT: vaddva.u32 r2, q1
; CHECK-NEXT: letp lr, .LBB1_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: mov r0, r2
diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
index c8dd949..a904347 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll
@@ -993,10 +993,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr}
; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: .pad #20
+; CHECK-NEXT: sub sp, #20
; CHECK-NEXT: cmp r3, #8
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: blo.w .LBB16_12
; CHECK-NEXT: @ %bb.1: @ %if.then
; CHECK-NEXT: lsrs.w r12, r3, #2
@@ -1016,50 +1016,48 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: subs r1, r7, #2
; CHECK-NEXT: rsbs r7, r4, #0
-; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: add.w r7, r3, #16
-; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add.w r5, r5, r0, lsl #1
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r6, r6, r0, lsl #1
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_4: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: wls lr, r0, .LBB16_5
; CHECK-NEXT: b .LBB16_10
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #8
-; CHECK-NEXT: add.w r0, r5, r0, lsl #1
+; CHECK-NEXT: add.w r0, r6, r0, lsl #1
; CHECK-NEXT: add.w r5, r0, #8
; CHECK-NEXT: beq.w .LBB16_12
; CHECK-NEXT: .LBB16_6: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB16_8 Depth 2
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: ldrh.w lr, [r3, #14]
; CHECK-NEXT: vldrw.u32 q0, [r0], #8
-; CHECK-NEXT: ldrh.w r8, [r3, #12]
+; CHECK-NEXT: ldrh.w r10, [r3, #12]
; CHECK-NEXT: ldrh r7, [r3, #10]
; CHECK-NEXT: ldrh r4, [r3, #8]
; CHECK-NEXT: ldrh r6, [r3, #6]
; CHECK-NEXT: ldrh.w r9, [r3, #4]
; CHECK-NEXT: ldrh.w r11, [r3, #2]
-; CHECK-NEXT: ldrh.w r10, [r3]
+; CHECK-NEXT: ldrh.w r8, [r3]
; CHECK-NEXT: vstrb.8 q0, [r1], #8
; CHECK-NEXT: vldrw.u32 q0, [r5]
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: adds r0, r5, #2
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: vmul.f16 q0, q0, r10
+; CHECK-NEXT: vmul.f16 q0, q0, r8
; CHECK-NEXT: adds r0, r5, #6
; CHECK-NEXT: vfma.f16 q0, q1, r11
; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
@@ -1068,73 +1066,73 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: add.w r0, r5, #10
; CHECK-NEXT: vfma.f16 q0, q1, r6
; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: add.w r6, r5, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: vldrw.u32 q1, [r0]
; CHECK-NEXT: add.w r0, r5, #14
; CHECK-NEXT: vfma.f16 q0, q1, r7
; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
-; CHECK-NEXT: adds r5, #16
-; CHECK-NEXT: vfma.f16 q0, q1, r8
+; CHECK-NEXT: vfma.f16 q0, q1, r10
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vfma.f16 q0, q1, lr
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_9
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r0, [r6], #16
-; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: adds r4, r5, #2
+; CHECK-NEXT: ldrh r0, [r5], #16
+; CHECK-NEXT: vldrw.u32 q1, [r6]
+; CHECK-NEXT: adds r4, r6, #2
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-14]
-; CHECK-NEXT: adds r4, r5, #6
+; CHECK-NEXT: ldrh r0, [r5, #-14]
+; CHECK-NEXT: adds r4, r6, #6
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-12]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #4]
+; CHECK-NEXT: ldrh r0, [r5, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #4]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-10]
-; CHECK-NEXT: add.w r4, r5, #10
+; CHECK-NEXT: ldrh r0, [r5, #-10]
+; CHECK-NEXT: add.w r4, r6, #10
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-8]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #8]
+; CHECK-NEXT: ldrh r0, [r5, #-8]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #8]
; CHECK-NEXT: vfma.f16 q0, q1, r0
; CHECK-NEXT: vldrw.u32 q1, [r4]
-; CHECK-NEXT: ldrh r0, [r6, #-6]
-; CHECK-NEXT: ldrh r4, [r6, #-2]
+; CHECK-NEXT: ldrh r0, [r5, #-6]
+; CHECK-NEXT: ldrh r4, [r5, #-2]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: ldrh r0, [r6, #-4]
-; CHECK-NEXT: vldrw.u32 q1, [r5, #12]
+; CHECK-NEXT: ldrh r0, [r5, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r6, #12]
; CHECK-NEXT: vfma.f16 q0, q1, r0
-; CHECK-NEXT: add.w r0, r5, #14
+; CHECK-NEXT: add.w r0, r6, #14
; CHECK-NEXT: vldrw.u32 q1, [r0]
-; CHECK-NEXT: adds r5, #16
+; CHECK-NEXT: adds r6, #16
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: add.w r5, r3, #16
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r0, r5
+; CHECK-NEXT: mov r0, r6
; CHECK-NEXT: .LBB16_11: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldrh r4, [r6], #2
+; CHECK-NEXT: ldrh r4, [r5], #2
; CHECK-NEXT: vldrh.u16 q1, [r0], #2
; CHECK-NEXT: vfma.f16 q0, q1, r4
; CHECK-NEXT: le lr, .LBB16_11
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_12: @ %if.end
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #20
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
entry:
%pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1
diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
index 28166e4..f7b4548 100644
--- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll
@@ -995,46 +995,44 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
; CHECK-NEXT: ldrh r6, [r0]
-; CHECK-NEXT: movs r5, #1
-; CHECK-NEXT: ldrd r4, r10, [r0, #4]
+; CHECK-NEXT: movs r4, #1
+; CHECK-NEXT: ldrd r7, r10, [r0, #4]
; CHECK-NEXT: sub.w r0, r6, #8
; CHECK-NEXT: add.w r3, r0, r0, lsr #29
; CHECK-NEXT: and r0, r0, #7
-; CHECK-NEXT: asrs r7, r3, #3
-; CHECK-NEXT: cmp r7, #1
+; CHECK-NEXT: asrs r5, r3, #3
+; CHECK-NEXT: cmp r5, #1
; CHECK-NEXT: it gt
-; CHECK-NEXT: asrgt r5, r3, #3
-; CHECK-NEXT: add.w r3, r4, r6, lsl #2
+; CHECK-NEXT: asrgt r4, r3, #3
+; CHECK-NEXT: add.w r3, r7, r6, lsl #2
; CHECK-NEXT: sub.w r9, r3, #4
; CHECK-NEXT: rsbs r3, r6, #0
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
-; CHECK-NEXT: add.w r3, r10, #32
-; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r4, [sp] @ 4-byte Spill
+; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: b .LBB16_6
; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r4, r4, r0, lsl #2
+; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: add.w r7, r7, r0, lsl #2
; CHECK-NEXT: b .LBB16_5
; CHECK-NEXT: .LBB16_4: @ %for.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
-; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload
; CHECK-NEXT: wls lr, r0, .LBB16_5
; CHECK-NEXT: b .LBB16_10
; CHECK-NEXT: .LBB16_5: @ %while.end
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: subs.w r12, r12, #1
; CHECK-NEXT: vstrb.8 q0, [r2], #16
-; CHECK-NEXT: add.w r0, r4, r0, lsl #2
-; CHECK-NEXT: add.w r4, r0, #16
+; CHECK-NEXT: add.w r0, r7, r0, lsl #2
+; CHECK-NEXT: add.w r7, r0, #16
; CHECK-NEXT: beq .LBB16_12
; CHECK-NEXT: .LBB16_6: @ %while.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
@@ -1042,76 +1040,76 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no
; CHECK-NEXT: @ Child Loop BB16_11 Depth 2
; CHECK-NEXT: add.w lr, r10, #8
; CHECK-NEXT: vldrw.u32 q0, [r1], #16
-; CHECK-NEXT: ldrd r3, r7, [r10]
+; CHECK-NEXT: ldrd r3, r4, [r10]
; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr}
; CHECK-NEXT: ldrd r11, r8, [r10, #24]
; CHECK-NEXT: vstrb.8 q0, [r9], #16
-; CHECK-NEXT: vldrw.u32 q0, [r4], #32
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
+; CHECK-NEXT: vldrw.u32 q0, [r7], #32
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
; CHECK-NEXT: vmul.f32 q0, q0, r3
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
-; CHECK-NEXT: vfma.f32 q0, q1, r7
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
+; CHECK-NEXT: vfma.f32 q0, q1, r4
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
; CHECK-NEXT: vfma.f32 q0, q6, r0
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
; CHECK-NEXT: vfma.f32 q0, q4, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
; CHECK-NEXT: vfma.f32 q0, q5, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
; CHECK-NEXT: vfma.f32 q0, q2, lr
-; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: vfma.f32 q0, q3, r11
; CHECK-NEXT: vfma.f32 q0, q1, r8
; CHECK-NEXT: cmp r0, #16
; CHECK-NEXT: blo .LBB16_9
; CHECK-NEXT: @ %bb.7: @ %for.body.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
; CHECK-NEXT: dls lr, r0
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: .LBB16_8: @ %for.body
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11}
-; CHECK-NEXT: vldrw.u32 q1, [r4], #32
-; CHECK-NEXT: vldrw.u32 q6, [r4, #-24]
-; CHECK-NEXT: vldrw.u32 q4, [r4, #-20]
+; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11}
+; CHECK-NEXT: vldrw.u32 q1, [r7], #32
+; CHECK-NEXT: vldrw.u32 q6, [r7, #-24]
+; CHECK-NEXT: vldrw.u32 q4, [r7, #-20]
; CHECK-NEXT: vfma.f32 q0, q1, r0
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-28]
-; CHECK-NEXT: vldrw.u32 q5, [r4, #-16]
-; CHECK-NEXT: vldrw.u32 q2, [r4, #-12]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-28]
+; CHECK-NEXT: vldrw.u32 q5, [r7, #-16]
+; CHECK-NEXT: vldrw.u32 q2, [r7, #-12]
; CHECK-NEXT: vfma.f32 q0, q1, r3
-; CHECK-NEXT: ldrd r9, r1, [r7, #24]
+; CHECK-NEXT: ldrd r9, r1, [r4, #24]
; CHECK-NEXT: vfma.f32 q0, q6, r5
-; CHECK-NEXT: vldrw.u32 q3, [r4, #-8]
+; CHECK-NEXT: vldrw.u32 q3, [r7, #-8]
; CHECK-NEXT: vfma.f32 q0, q4, r6
-; CHECK-NEXT: vldrw.u32 q1, [r4, #-4]
+; CHECK-NEXT: vldrw.u32 q1, [r7, #-4]
; CHECK-NEXT: vfma.f32 q0, q5, r8
-; CHECK-NEXT: adds r7, #32
+; CHECK-NEXT: adds r4, #32
; CHECK-NEXT: vfma.f32 q0, q2, r11
; CHECK-NEXT: vfma.f32 q0, q3, r9
; CHECK-NEXT: vfma.f32 q0, q1, r1
; CHECK-NEXT: le lr, .LBB16_8
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: add.w r4, r10, #32
; CHECK-NEXT: b .LBB16_4
; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader
; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1
-; CHECK-NEXT: mov r3, r4
+; CHECK-NEXT: mov r3, r7
; CHECK-NEXT: .LBB16_11: @ %while.body76
; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: ldr r0, [r7], #4
+; CHECK-NEXT: ldr r0, [r4], #4
; CHECK-NEXT: vldrw.u32 q1, [r3], #4
; CHECK-NEXT: vfma.f32 q0, q1, r0
; CHECK-NEXT: le lr, .LBB16_11
; CHECK-NEXT: b .LBB16_3
; CHECK-NEXT: .LBB16_12:
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr}
diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
index e8b49c1..0d86f22 100644
--- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll
@@ -711,8 +711,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #136
-; CHECK-NEXT: sub sp, #136
+; CHECK-NEXT: .pad #120
+; CHECK-NEXT: sub sp, #120
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill
; CHECK-NEXT: blt.w .LBB14_5
@@ -725,22 +725,20 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
; CHECK-NEXT: subs r1, #8
; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill
-; CHECK-NEXT: vmov.i16 q2, #0x18
; CHECK-NEXT: add.w r1, r2, r1, lsr #3
; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill
; CHECK-NEXT: adr r1, .LCPI14_0
; CHECK-NEXT: adr r2, .LCPI14_1
; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r2]
-; CHECK-NEXT: add r2, sp, #120
+; CHECK-NEXT: add r2, sp, #104
; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill
; CHECK-NEXT: .LBB14_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB14_3 Depth 2
; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, sp, #104
+; CHECK-NEXT: add.w r10, sp, #88
; CHECK-NEXT: dls lr, r1
; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload
; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload
@@ -762,7 +760,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: vmov r6, r2, d4
; CHECK-NEXT: ldrh r1, [r1]
; CHECK-NEXT: ldrh.w r12, [r4]
-; CHECK-NEXT: add r4, sp, #88
+; CHECK-NEXT: add r4, sp, #72
; CHECK-NEXT: ldrh.w r11, [r5]
; CHECK-NEXT: ldrh r3, [r3]
; CHECK-NEXT: ldrh r5, [r6]
@@ -807,7 +805,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: vmov.16 q3[0], r2
; CHECK-NEXT: vmov.16 q3[1], r5
; CHECK-NEXT: vmov r2, r5, d5
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vmov.i16 q2, #0x18
; CHECK-NEXT: vadd.i16 q6, q6, q2
; CHECK-NEXT: vadd.i16 q5, q5, q2
; CHECK-NEXT: vadd.i16 q4, q4, q2
@@ -849,7 +847,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read
; CHECK-NEXT: cmp r1, r3
; CHECK-NEXT: bne.w .LBB14_2
; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #136
+; CHECK-NEXT: add sp, #120
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -950,7 +948,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: adr r6, .LCPI15_9
-; CHECK-NEXT: vmov.i32 q2, #0x30
; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill
; CHECK-NEXT: vldrw.u32 q0, [r7]
; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
@@ -963,212 +960,213 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read
; CHECK-NEXT: .LBB15_2: @ %vector.ph
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB15_3 Depth 2
+; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: adr r1, .LCPI15_3
-; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q0, [r1]
-; CHECK-NEXT: adr r1, .LCPI15_4
; CHECK-NEXT: vldrw.u32 q5, [r1]
+; CHECK-NEXT: adr r1, .LCPI15_4
+; CHECK-NEXT: vstrw.32 q2, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [r1]
; CHECK-NEXT: adr r1, .LCPI15_2
-; CHECK-NEXT: vldrw.u32 q3, [r1]
+; CHECK-NEXT: vstrw.32 q2, [sp, #280] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q1, [r1]
; CHECK-NEXT: adr r1, .LCPI15_10
-; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q3, [r1]
; CHECK-NEXT: adr r1, .LCPI15_11
; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q2, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload
; CHECK-NEXT: vldrw.u32 q7, [r1]
-; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
; CHECK-NEXT: mov r11, r10
-; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q2, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
; CHECK-NEXT: .LBB15_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: vadd.i32 q4, q1, r0
-; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill
-; CHECK-NEXT: vmov r1, lr, d8
-; CHECK-NEXT: vadd.i32 q7, q7, r0
-; CHECK-NEXT: vmov r5, r4, d15
-; CHECK-NEXT: vadd.i32 q6, q0, r0
-; CHECK-NEXT: vmov r6, r7, d13
+; CHECK-NEXT: vmov q0, q7
+; CHECK-NEXT: vstrw.32 q7, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q7, q5, r0
+; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
+; CHECK-NEXT: vadd.i32 q5, q0, r0
+; CHECK-NEXT: vmov q0, q6
+; CHECK-NEXT: vadd.i32 q6, q4, r0
+; CHECK-NEXT: vmov r5, r4, d11
+; CHECK-NEXT: vmov r1, lr, d12
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov r6, r7, d15
; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, r0
-; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov q1, q3
+; CHECK-NEXT: vstrw.32 q4, [sp, #168] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #248] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #120] @ 16-byte Spill
+; CHECK-NEXT: vstrw.32 q3, [sp, #136] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q3, [sp, #184] @ 16-byte Reload
; CHECK-NEXT: subs.w r11, r11, #16
-; CHECK-NEXT: ldrb.w r9, [r1]
-; CHECK-NEXT: vmov r1, r3, d14
; CHECK-NEXT: ldrb r5, [r5]
+; CHECK-NEXT: ldrb.w r9, [r1]
+; CHECK-NEXT: vmov r1, r3, d10
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov.8 q5[0], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[1], r1
-; CHECK-NEXT: vmov r1, r3, d12
-; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q5[1], r1
+; CHECK-NEXT: vmov r1, r3, d14
+; CHECK-NEXT: vmov.8 q5[2], r5
; CHECK-NEXT: ldrb r5, [r6]
; CHECK-NEXT: ldrb r6, [r4]
-; CHECK-NEXT: vmov.8 q7[3], r6
+; CHECK-NEXT: vmov.8 q5[3], r6
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[0], r1
-; CHECK-NEXT: vmov r6, r1, d2
-; CHECK-NEXT: vmov.8 q6[1], r3
-; CHECK-NEXT: vmov.8 q6[2], r5
-; CHECK-NEXT: vmov.8 q6[3], r7
+; CHECK-NEXT: vmov.8 q7[0], r1
+; CHECK-NEXT: vmov r6, r1, d4
+; CHECK-NEXT: vmov.8 q7[1], r3
+; CHECK-NEXT: vmov.8 q7[2], r5
+; CHECK-NEXT: vmov.8 q7[3], r7
; CHECK-NEXT: ldrb.w r7, [lr]
-; CHECK-NEXT: vmov.8 q6[4], r9
-; CHECK-NEXT: vmov.8 q6[5], r7
+; CHECK-NEXT: vmov.8 q7[4], r9
+; CHECK-NEXT: vmov.8 q7[5], r7
; CHECK-NEXT: ldrb r4, [r1]
-; CHECK-NEXT: vmov r1, r5, d3
-; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload
+; CHECK-NEXT: vmov r1, r5, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #280] @ 16-byte Reload
; CHECK-NEXT: ldrb.w r12, [r1]
-; CHECK-NEXT: vmov r1, r3, d9
+; CHECK-NEXT: vmov r1, r3, d13
; CHECK-NEXT: ldrb r5, [r5]
-; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #232] @ 16-byte Reload
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q6[6], r1
-; CHECK-NEXT: vmov r1, r7, d0
-; CHECK-NEXT: vmov.8 q6[7], r3
+; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov r1, r7, d4
+; CHECK-NEXT: vmov.8 q7[7], r3
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r7, [r7]
-; CHECK-NEXT: vmov.8 q7[4], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[5], r7
-; CHECK-NEXT: vadd.i32 q0, q0, r0
+; CHECK-NEXT: vmov.8 q5[4], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vmov.8 q5[5], r7
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
-; CHECK-NEXT: vmov.8 q7[6], r1
+; CHECK-NEXT: vmov.8 q5[6], r1
; CHECK-NEXT: ldrb r1, [r6]
-; CHECK-NEXT: vmov r7, r6, d0
-; CHECK-NEXT: vmov.8 q7[7], r3
-; CHECK-NEXT: vmov r3, lr, d1
-; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vmov.8 q7[8], r1
-; CHECK-NEXT: vadd.i32 q0, q0, r0
-; CHECK-NEXT: vmov.8 q7[9], r4
-; CHECK-NEXT: vmov r4, r1, d0
-; CHECK-NEXT: vmov.8 q7[10], r12
-; CHECK-NEXT: vmov.8 q7[11], r5
+; CHECK-NEXT: vmov.8 q5[7], r3
+; CHECK-NEXT: vmov r7, r6, d4
+; CHECK-NEXT: vmov r3, lr, d5
+; CHECK-NEXT: vmov.8 q5[8], r1
+; CHECK-NEXT: vadd.i32 q2, q1, r0
+; CHECK-NEXT: vmov.8 q5[9], r4
+; CHECK-NEXT: vmov r4, r1, d4
+; CHECK-NEXT: vmov.8 q5[10], r12
+; CHECK-NEXT: vmov.8 q5[11], r5
+; CHECK-NEXT: vldrw.u32 q1, [sp, #264] @ 16-byte Reload
; CHECK-NEXT: ldrb r7, [r7]
; CHECK-NEXT: ldrb r6, [r6]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: ldrb r4, [r4]
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q6[8], r4
-; CHECK-NEXT: vmov r5, r4, d1
-; CHECK-NEXT: vmov.8 q6[9], r1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vmov.8 q7[8], r4
+; CHECK-NEXT: vmov r5, r4, d5
+; CHECK-NEXT: vmov.8 q7[9], r1
+; CHECK-NEXT: vadd.i32 q2, q0, r0
+; CHECK-NEXT: vldrw.u32 q0, [sp, #216] @ 16-byte Reload
; CHECK-NEXT: ldrb r5, [r5]
; CHECK-NEXT: ldrb r4, [r4]
-; CHECK-NEXT: vmov.8 q6[10], r5
-; CHECK-NEXT: vmov.8 q6[11], r4
-; CHECK-NEXT: vmov.8 q6[12], r7
-; CHECK-NEXT: vmov.8 q6[13], r6
-; CHECK-NEXT: vmov.8 q6[14], r3
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov.8 q7[10], r5
+; CHECK-NEXT: vmov.8 q7[11], r4
+; CHECK-NEXT: vmov.8 q7[12], r7
+; CHECK-NEXT: vmov.8 q7[13], r6
+; CHECK-NEXT: vmov.8 q7[14], r3
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[12], r1
+; CHECK-NEXT: vmov.8 q5[12], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q1, r0
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
-; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vmov.8 q5[13], r1
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q1, r0
; CHECK-NEXT: ldrb r1, [r1]
-; CHECK-NEXT: vmov.8 q7[14], r1
+; CHECK-NEXT: vmov.8 q5[14], r1
; CHECK-NEXT: ldrb r1, [r3]
-; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov.8 q5[15], r1
; CHECK-NEXT: ldrb.w r1, [lr]
-; CHECK-NEXT: vmov.8 q6[15], r1
-; CHECK-NEXT: vmov r1, r3, d0
-; CHECK-NEXT: vadd.i8 q6, q6, q7
+; CHECK-NEXT: vmov.8 q7[15], r1
+; CHECK-NEXT: vmov r1, r3, d4
+; CHECK-NEXT: vadd.i8 q5, q7, q5
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: ldrb r3, [r3]
; CHECK-NEXT: vmov.8 q7[0], r1
; CHECK-NEXT: vmov.8 q7[1], r3
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q3, r0
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q3, q3, q2
-; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q4, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[2], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[3], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[4], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[5], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q5, r0
-; CHECK-NEXT: vadd.i32 q5, q5, q2
-; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q6, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[6], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[7], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[8], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[9], r1
-; CHECK-NEXT: vmov r1, r3, d1
-; CHECK-NEXT: vadd.i32 q0, q4, r0
-; CHECK-NEXT: vadd.i32 q4, q4, q2
-; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill
+; CHECK-NEXT: vmov r1, r3, d5
+; CHECK-NEXT: vadd.i32 q2, q0, r0
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[10], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[11], r1
-; CHECK-NEXT: vmov r1, r3, d0
+; CHECK-NEXT: vmov r1, r3, d4
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[12], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[13], r1
-; CHECK-NEXT: vmov r1, r3, d1
+; CHECK-NEXT: vmov r1, r3, d5
; CHECK-NEXT: ldrb r1, [r1]
; CHECK-NEXT: vmov.8 q7[14], r1
; CHECK-NEXT: ldrb r1, [r3]
; CHECK-NEXT: vmov.8 q7[15], r1
-; CHECK-NEXT: vadd.i8 q0, q6, q7
-; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload
-; CHECK-NEXT: vstrb.8 q0, [r8], #16
-; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload
-; CHECK-NEXT: vadd.i32 q7, q7, q2
+; CHECK-NEXT: vadd.i8 q2, q5, q7
+; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload
+; CHECK-NEXT: vstrb.8 q2, [r8], #16
+; CHECK-NEXT: vmov.i32 q2, #0x30
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #296] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vmov q7, q3
+; CHECK-NEXT: vldrw.u32 q3, [sp, #136] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q1, [sp, #264] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q4, [sp, #248] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q4, [sp, #168] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q6, [sp, #296] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q6, [sp, #120] @ 16-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload
+; CHECK-NEXT: vadd.i32 q5, q5, q2
+; CHECK-NEXT: vadd.i32 q3, q3, q2
+; CHECK-NEXT: vadd.i32 q0, q0, q2
+; CHECK-NEXT: vadd.i32 q4, q4, q2
+; CHECK-NEXT: vadd.i32 q1, q1, q2
+; CHECK-NEXT: vadd.i32 q6, q6, q2
+; CHECK-NEXT: vstrw.32 q0, [sp, #280] @ 16-byte Spill
; CHECK-NEXT: bne.w .LBB15_3
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1
@@ -1501,14 +1499,14 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n)
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: blt .LBB18_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI18_0
+; CHECK-NEXT: adr r3, .LCPI18_0
; CHECK-NEXT: adr r4, .LCPI18_1
; CHECK-NEXT: adr r5, .LCPI18_2
; CHECK-NEXT: adr r6, .LCPI18_3
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vldrw.u32 q1, [r5]
; CHECK-NEXT: vldrw.u32 q2, [r4]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vldrw.u32 q3, [r3]
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: vadd.i32 q1, q1, r1
; CHECK-NEXT: vadd.i32 q2, q2, r1
diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
index dad856c..00a998c 100644
--- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll
@@ -38,7 +38,7 @@ define arm_aapcs_vfpcc void @k() {
; CHECK-NEXT: vmov.i32 q5, #0x0
; CHECK-NEXT: vpsel q6, q4, q3
; CHECK-NEXT: vstrh.16 q6, [r0]
-; CHECK-NEXT: vmov q6, q5
+; CHECK-NEXT: vmov.i32 q6, #0x0
; CHECK-NEXT: cbz r1, .LBB0_2
; CHECK-NEXT: le .LBB0_1
; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader
@@ -135,12 +135,12 @@ vector.body115: ; preds = %vector.body115, %ve
define dso_local i32 @e() #0 {
; CHECK-LABEL: e:
; CHECK: @ %bb.0: @ %entry
-; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr}
-; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr}
+; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr}
+; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #408
-; CHECK-NEXT: sub sp, #408
+; CHECK-NEXT: .pad #392
+; CHECK-NEXT: sub sp, #392
; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals
; CHECK-NEXT: vldr s15, .LCPI1_1
; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals
@@ -148,18 +148,16 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: mov r4, r7
; CHECK-NEXT: mov r3, r7
; CHECK-NEXT: ldr r6, [r4, #8]!
-; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: ldr r0, [r3, #4]!
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
; CHECK-NEXT: movt r2, :upper16:e
+; CHECK-NEXT: ldr r0, [r3, #4]!
; CHECK-NEXT: vmov r5, s15
; CHECK-NEXT: vmov q0[2], q0[0], r4, r4
-; CHECK-NEXT: vmov s13, r3
; CHECK-NEXT: vldr s12, .LCPI1_0
+; CHECK-NEXT: vmov s13, r3
; CHECK-NEXT: vmov q0[3], q0[1], r5, r2
; CHECK-NEXT: vdup.32 q7, r3
; CHECK-NEXT: vmov q6[2], q6[0], r3, r5
-; CHECK-NEXT: vstrw.32 q0, [sp, #92]
+; CHECK-NEXT: vstrw.32 q0, [sp, #76]
; CHECK-NEXT: vmov q0, q7
; CHECK-NEXT: vmov q6[3], q6[1], r3, r2
; CHECK-NEXT: vmov q4, q7
@@ -168,7 +166,7 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: vmov s21, r2
; CHECK-NEXT: movs r1, #64
; CHECK-NEXT: vmov.f32 s20, s12
-; CHECK-NEXT: str r0, [sp, #40]
+; CHECK-NEXT: str r0, [sp, #24]
; CHECK-NEXT: vmov.f32 s22, s13
; CHECK-NEXT: str r6, [r0]
; CHECK-NEXT: vmov.f32 s23, s15
@@ -186,12 +184,12 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: vmov q2[3], q2[1], r4, r5
; CHECK-NEXT: vmov.32 q4[0], r8
; CHECK-NEXT: @ implicit-def: $r2
-; CHECK-NEXT: str.w r8, [sp, #44]
-; CHECK-NEXT: vstrw.32 q3, [sp, #60]
-; CHECK-NEXT: strh.w r12, [sp, #406]
+; CHECK-NEXT: str.w r8, [sp, #28]
+; CHECK-NEXT: vstrw.32 q3, [sp, #44]
+; CHECK-NEXT: strh.w r12, [sp, #390]
; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2
; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: vstrb.8 q0, [r2], #16
; CHECK-NEXT: letp lr, .LBB1_1
; CHECK-NEXT: .LBB1_2: @ %entry
@@ -199,7 +197,7 @@ define dso_local i32 @e() #0 {
; CHECK-NEXT: str.w r8, [r7]
; CHECK-NEXT: vstrw.32 q4, [r0]
; CHECK-NEXT: vstrw.32 q2, [r0]
-; CHECK-NEXT: str.w r12, [sp, #324]
+; CHECK-NEXT: str.w r12, [sp, #308]
; CHECK-NEXT: .LBB1_3: @ %for.cond
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
; CHECK-NEXT: b .LBB1_3
diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
index f90af3c..2587a0bb 100644
--- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll
@@ -115,17 +115,17 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: movs r4, #1
; CHECK-NEXT: ldr r3, [r0]
; CHECK-NEXT: add.w r11, r3, r12, lsl #2
-; CHECK-NEXT: add.w r7, r3, r12, lsl #3
-; CHECK-NEXT: lsl.w r9, r12, #3
+; CHECK-NEXT: add.w r6, r3, r12, lsl #3
+; CHECK-NEXT: lsl.w r10, r12, #3
; CHECK-NEXT: .LBB1_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB1_3 Depth 2
; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload
+; CHECK-NEXT: add.w r9, r4, #1
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: add.w r10, r4, #1
; CHECK-NEXT: mov r3, r11
-; CHECK-NEXT: mov r0, r7
-; CHECK-NEXT: vmov q1, q0
+; CHECK-NEXT: mov r0, r6
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB1_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1
@@ -139,11 +139,11 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1
; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add.w r0, r2, r10, lsl #2
+; CHECK-NEXT: add.w r0, r2, r9, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s1
-; CHECK-NEXT: add r11, r9
+; CHECK-NEXT: add r11, r10
; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add r7, r9
+; CHECK-NEXT: add r6, r10
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vadd.f32 s2, s4, s6
@@ -228,46 +228,40 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9}
; CHECK-NEXT: vpush {d8, d9}
-; CHECK-NEXT: .pad #24
-; CHECK-NEXT: sub sp, #24
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: subs r1, #3
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo .LBB2_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r9, [r0, #8]
; CHECK-NEXT: movs r5, #1
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r0, r3, r3, lsl #1
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
-; CHECK-NEXT: add.w r12, r1, r3, lsl #3
-; CHECK-NEXT: adds r3, #3
+; CHECK-NEXT: add.w r3, r9, #3
; CHECK-NEXT: bic r3, r3, #3
-; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
-; CHECK-NEXT: add.w r10, r1, r0, lsl #2
+; CHECK-NEXT: add.w r0, r9, r9, lsl #1
; CHECK-NEXT: subs r3, #4
+; CHECK-NEXT: add.w r10, r1, r9, lsl #2
+; CHECK-NEXT: add.w r12, r1, r9, lsl #3
+; CHECK-NEXT: add.w r1, r1, r0, lsl #2
+; CHECK-NEXT: add.w r3, r5, r3, lsr #2
+; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
; CHECK-NEXT: lsl.w r11, r0, #2
-; CHECK-NEXT: add.w r1, r5, r3, lsr #2
-; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB2_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB2_3 Depth 2
-; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: adds r0, r5, #2
-; CHECK-NEXT: adds r2, r5, #1
-; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
-; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: mov r3, r10
; CHECK-NEXT: mov r0, r12
-; CHECK-NEXT: mov r4, r10
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: mov r4, r1
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: dlstp.32 lr, r9
; CHECK-NEXT: .LBB2_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
@@ -282,31 +276,31 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1
; CHECK-NEXT: vadd.f32 s10, s10, s11
-; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r5, #1
; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: add r9, r11
+; CHECK-NEXT: add r10, r11
; CHECK-NEXT: vadd.f32 s6, s6, s7
-; CHECK-NEXT: add.w r0, r1, r2, lsl #2
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: add r12, r11
; CHECK-NEXT: vadd.f32 s2, s2, s3
-; CHECK-NEXT: add r10, r11
+; CHECK-NEXT: add r1, r11
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: add.w r0, r1, r5, lsl #2
-; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r5, lsl #2
; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: add.w r0, r1, r0, lsl #2
+; CHECK-NEXT: adds r0, r5, #2
+; CHECK-NEXT: adds r5, #3
+; CHECK-NEXT: add.w r0, r2, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r5, r0
; CHECK-NEXT: blo .LBB2_2
; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #24
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -394,15 +388,15 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11}
; CHECK-NEXT: vpush {d8, d9, d10, d11}
-; CHECK-NEXT: .pad #40
-; CHECK-NEXT: sub sp, #40
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: .pad #24
+; CHECK-NEXT: sub sp, #24
+; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
-; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: subs r1, #4
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
-; CHECK-NEXT: blo.w .LBB3_5
+; CHECK-NEXT: blo .LBB3_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
; CHECK-NEXT: ldr r2, [r0, #8]
; CHECK-NEXT: movs r6, #1
@@ -410,34 +404,28 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: add.w r0, r2, r2, lsl #1
; CHECK-NEXT: add.w r12, r1, r2, lsl #2
; CHECK-NEXT: add.w r8, r1, r2, lsl #3
-; CHECK-NEXT: add.w r9, r1, r2, lsl #4
-; CHECK-NEXT: add.w r11, r1, r0, lsl #2
+; CHECK-NEXT: add.w r10, r1, r2, lsl #4
+; CHECK-NEXT: add.w r9, r1, r0, lsl #2
; CHECK-NEXT: adds r0, r2, #3
; CHECK-NEXT: bic r0, r0, #3
; CHECK-NEXT: subs r0, #4
; CHECK-NEXT: add.w r0, r6, r0, lsr #2
-; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill
+; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill
; CHECK-NEXT: lsls r0, r2, #4
-; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload
-; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload
+; CHECK-NEXT: str r0, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB3_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB3_3 Depth 2
-; CHECK-NEXT: adds r0, r6, #3
-; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #2
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
; CHECK-NEXT: vmov.i32 q0, #0x0
-; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r0, r6, #1
-; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill
; CHECK-NEXT: mov r3, r12
; CHECK-NEXT: mov r0, r8
-; CHECK-NEXT: mov r5, r11
-; CHECK-NEXT: mov r4, r9
-; CHECK-NEXT: vmov q1, q0
-; CHECK-NEXT: vmov q2, q0
-; CHECK-NEXT: vmov q3, q0
+; CHECK-NEXT: mov r5, r9
+; CHECK-NEXT: mov r4, r10
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: dlstp.32 lr, r7
; CHECK-NEXT: .LBB3_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1
@@ -455,9 +443,9 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: @ %bb.4: @ %middle.block
; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1
; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #1
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
@@ -471,24 +459,24 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s0, s0, s2
; CHECK-NEXT: vstr s12, [r0]
; CHECK-NEXT: add.w r0, r1, r6, lsl #2
-; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: vstr s8, [r0]
-; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #2
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s4, [r0]
-; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r0, r6, #3
+; CHECK-NEXT: adds r6, #4
; CHECK-NEXT: add.w r0, r1, r0, lsl #2
; CHECK-NEXT: vstr s0, [r0]
-; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload
; CHECK-NEXT: add r12, r0
; CHECK-NEXT: add r8, r0
-; CHECK-NEXT: add r11, r0
; CHECK-NEXT: add r9, r0
-; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: add r10, r0
+; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload
; CHECK-NEXT: cmp r6, r0
; CHECK-NEXT: blo .LBB3_2
; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #40
+; CHECK-NEXT: add sp, #24
; CHECK-NEXT: vpop {d8, d9, d10, d11}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -588,60 +576,53 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #5
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB4_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #2
-; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: add.w r1, r12, r12, lsl #2
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB4_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB4_3 Depth 2
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: add.w r10, r0, #2
+; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: add.w r11, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB4_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: add.w r9, r3, r5
; CHECK-NEXT: vldrw.u32 q5, [r4], #16
; CHECK-NEXT: vldrw.u32 q6, [r3], #16
-; CHECK-NEXT: add.w r12, r9, r5
+; CHECK-NEXT: add.w r10, r9, r5
; CHECK-NEXT: vfma.f32 q3, q6, q5
; CHECK-NEXT: vldrw.u32 q6, [r9]
-; CHECK-NEXT: add.w r6, r12, r5
+; CHECK-NEXT: add.w r6, r10, r5
; CHECK-NEXT: vfma.f32 q4, q6, q5
-; CHECK-NEXT: vldrw.u32 q6, [r12]
+; CHECK-NEXT: vldrw.u32 q6, [r10]
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vfma.f32 q2, q6, q5
; CHECK-NEXT: vldrw.u32 q6, [r6]
@@ -662,30 +643,31 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s0, s0, s1
; CHECK-NEXT: vadd.f32 s1, s16, s18
-; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s12, s12, s14
+; CHECK-NEXT: vadd.f32 s2, s2, s3
; CHECK-NEXT: vadd.f32 s4, s4, s6
; CHECK-NEXT: vadd.f32 s6, s8, s10
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
-; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: add.w r1, r2, r10, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
+; CHECK-NEXT: vadd.f32 s0, s0, s2
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
+; CHECK-NEXT: adds r0, #5
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
-; CHECK-NEXT: blo.w .LBB4_2
+; CHECK-NEXT: blo .LBB4_2
; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -797,63 +779,54 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: .pad #16
+; CHECK-NEXT: sub sp, #16
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #6
-; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB5_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r12, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r12, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r8, r1, r3, lsl #2
+; CHECK-NEXT: add.w r8, r1, r12, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r12, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
-; CHECK-NEXT: add.w r1, r3, r3, lsl #1
+; CHECK-NEXT: add.w r1, r12, r12, lsl #1
; CHECK-NEXT: lsls r1, r1, #3
; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB5_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB5_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q1, #0x0
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
-; CHECK-NEXT: add.w r11, r0, #2
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q1, #0x0
; CHECK-NEXT: mov r3, r8
-; CHECK-NEXT: vmov q3, q1
-; CHECK-NEXT: vmov q4, q1
-; CHECK-NEXT: vmov q0, q1
-; CHECK-NEXT: vmov q5, q1
-; CHECK-NEXT: vmov q2, q1
-; CHECK-NEXT: dlstp.32 lr, r7
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: dlstp.32 lr, r12
; CHECK-NEXT: .LBB5_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
-; CHECK-NEXT: add.w r12, r3, r5
+; CHECK-NEXT: add.w r10, r3, r5
; CHECK-NEXT: vldrw.u32 q6, [r1], #16
; CHECK-NEXT: vldrw.u32 q7, [r3], #16
-; CHECK-NEXT: add.w r10, r12, r5
+; CHECK-NEXT: add.w r11, r10, r5
; CHECK-NEXT: vfma.f32 q4, q7, q6
-; CHECK-NEXT: vldrw.u32 q7, [r12]
-; CHECK-NEXT: add.w r6, r10, r5
-; CHECK-NEXT: vfma.f32 q5, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r10]
+; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vfma.f32 q5, q7, q6
+; CHECK-NEXT: vldrw.u32 q7, [r11]
; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vfma.f32 q2, q7, q6
; CHECK-NEXT: vldrw.u32 q7, [r6]
@@ -885,28 +858,29 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vstr s1, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s0, s0, s2
-; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: vstr s3, [r1]
-; CHECK-NEXT: add.w r1, r2, r11, lsl #2
+; CHECK-NEXT: adds r1, r0, #2
; CHECK-NEXT: vadd.f32 s4, s4, s6
+; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: vadd.f32 s6, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
+; CHECK-NEXT: adds r0, #6
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r8, r1
-; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB5_2
; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #32
+; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1030,73 +1004,64 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #72
-; CHECK-NEXT: sub sp, #72
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #48
+; CHECK-NEXT: sub sp, #48
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #7
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB6_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r10, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r10, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r9, r1, r3, lsl #2
+; CHECK-NEXT: add.w r9, r1, r10, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r5, r3, #2
+; CHECK-NEXT: lsl.w r5, r10, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: rsb r1, r3, r3, lsl #3
-; CHECK-NEXT: lsls r1, r1, #2
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: rsb r1, r10, r10, lsl #3
+; CHECK-NEXT: lsls r1, r1, #2
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB6_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB6_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #3
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
-; CHECK-NEXT: vmov.i32 q2, #0x0
-; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload
; CHECK-NEXT: adds r4, r0, #2
; CHECK-NEXT: add.w r8, r0, #1
-; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
; CHECK-NEXT: mov r3, r9
-; CHECK-NEXT: vmov q4, q2
-; CHECK-NEXT: vmov q5, q2
-; CHECK-NEXT: vmov q3, q2
-; CHECK-NEXT: vmov q6, q2
-; CHECK-NEXT: vmov q1, q2
-; CHECK-NEXT: mov r12, r7
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r6
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q1, #0x0
+; CHECK-NEXT: mov r12, r10
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
; CHECK-NEXT: .LBB6_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r12
-; CHECK-NEXT: add.w r10, r3, r5
+; CHECK-NEXT: add.w r11, r3, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q7, [r1], #16
; CHECK-NEXT: vldrwt.u32 q0, [r3], #16
-; CHECK-NEXT: add.w r11, r10, r5
+; CHECK-NEXT: add.w r6, r11, r5
; CHECK-NEXT: sub.w r12, r12, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q5, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r10]
-; CHECK-NEXT: add.w r6, r11, r5
+; CHECK-NEXT: vldrwt.u32 q0, [r11]
+; CHECK-NEXT: adds r7, r6, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q6, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q1, q0, q7
@@ -1104,26 +1069,26 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vmov q3, q1
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
-; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r6, r5
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q1, q0, q7
; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: adds r6, r7, r5
-; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q1, q0, q7
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill
; CHECK-NEXT: vmov q1, q3
; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q3, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r6]
+; CHECK-NEXT: vldrwt.u32 q0, [r7]
; CHECK-NEXT: vmov q4, q5
-; CHECK-NEXT: adds r7, r6, r5
+; CHECK-NEXT: adds r6, r7, r5
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q4, q0, q7
-; CHECK-NEXT: vldrwt.u32 q0, [r7]
+; CHECK-NEXT: vldrwt.u32 q0, [r6]
; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q2, q0, q7
; CHECK-NEXT: le lr, .LBB6_3
@@ -1138,45 +1103,45 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s4, s4, s5
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
-; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: vadd.f32 s2, s3, s1
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
+; CHECK-NEXT: vadd.f32 s0, s2, s0
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
+; CHECK-NEXT: vadd.f32 s2, s3, s1
; CHECK-NEXT: vadd.f32 s4, s4, s6
-; CHECK-NEXT: vstr s0, [r1]
-; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s14, s14, s15
-; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: vadd.f32 s12, s12, s13
-; CHECK-NEXT: vstr s2, [r1]
-; CHECK-NEXT: add.w r1, r2, r4, lsl #2
+; CHECK-NEXT: vstr s0, [r1]
+; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s8, s8, s10
; CHECK-NEXT: vadd.f32 s6, s7, s5
-; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: vstr s2, [r1]
+; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vadd.f32 s10, s11, s9
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
+; CHECK-NEXT: vstr s4, [r1]
+; CHECK-NEXT: adds r1, r0, #3
; CHECK-NEXT: vadd.f32 s12, s12, s14
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s10, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
+; CHECK-NEXT: adds r0, #7
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r9, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB6_2
; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #72
+; CHECK-NEXT: add sp, #48
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
@@ -1312,107 +1277,99 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: sub sp, #4
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #88
-; CHECK-NEXT: sub sp, #88
-; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill
+; CHECK-NEXT: .pad #64
+; CHECK-NEXT: sub sp, #64
+; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill
; CHECK-NEXT: ldr r1, [r0, #4]
; CHECK-NEXT: subs r1, #8
-; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill
+; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
; CHECK-NEXT: cmp r1, #2
; CHECK-NEXT: blo.w .LBB7_5
; CHECK-NEXT: @ %bb.1: @ %for.body.preheader
-; CHECK-NEXT: ldr r3, [r0, #8]
+; CHECK-NEXT: ldr.w r11, [r0, #8]
; CHECK-NEXT: ldr r1, [r0]
-; CHECK-NEXT: adds r0, r3, #3
-; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill
+; CHECK-NEXT: add.w r0, r11, #3
; CHECK-NEXT: bic r0, r0, #3
-; CHECK-NEXT: add.w r12, r1, r3, lsl #2
+; CHECK-NEXT: add.w r12, r1, r11, lsl #2
; CHECK-NEXT: subs r1, r0, #4
; CHECK-NEXT: movs r0, #1
-; CHECK-NEXT: lsls r6, r3, #2
+; CHECK-NEXT: lsl.w r6, r11, #2
; CHECK-NEXT: add.w r1, r0, r1, lsr #2
-; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill
-; CHECK-NEXT: lsls r1, r3, #5
; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill
+; CHECK-NEXT: lsl.w r1, r11, #5
+; CHECK-NEXT: str r1, [sp] @ 4-byte Spill
; CHECK-NEXT: .LBB7_2: @ %for.body
; CHECK-NEXT: @ =>This Loop Header: Depth=1
; CHECK-NEXT: @ Child Loop BB7_3 Depth 2
-; CHECK-NEXT: adds r1, r0, #7
-; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #6
-; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #5
-; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload
-; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill
-; CHECK-NEXT: adds r1, r0, #4
-; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload
-; CHECK-NEXT: vmov.i32 q3, #0x0
-; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill
+; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload
+; CHECK-NEXT: vmov.i32 q0, #0x0
; CHECK-NEXT: adds r4, r0, #3
; CHECK-NEXT: add.w r8, r0, #2
; CHECK-NEXT: adds r1, r0, #1
-; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload
+; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vmov.i32 q0, #0x0
+; CHECK-NEXT: vmov.i32 q3, #0x0
; CHECK-NEXT: mov r3, r12
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q6, q3
-; CHECK-NEXT: vmov q4, q3
-; CHECK-NEXT: vmov q7, q3
-; CHECK-NEXT: vmov q2, q3
-; CHECK-NEXT: mov r10, r7
-; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: dls lr, r5
+; CHECK-NEXT: vmov.i32 q5, #0x0
+; CHECK-NEXT: vmov.i32 q6, #0x0
+; CHECK-NEXT: vmov.i32 q4, #0x0
+; CHECK-NEXT: vmov.i32 q7, #0x0
+; CHECK-NEXT: vmov.i32 q2, #0x0
+; CHECK-NEXT: mov r10, r11
+; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: dls lr, r7
; CHECK-NEXT: .LBB7_3: @ %vector.body
; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1
; CHECK-NEXT: @ => This Inner Loop Header: Depth=2
; CHECK-NEXT: vctp.32 r10
-; CHECK-NEXT: add.w r11, r3, r6
+; CHECK-NEXT: adds r5, r3, r6
; CHECK-NEXT: vpstt
; CHECK-NEXT: vldrwt.u32 q0, [r9], #16
; CHECK-NEXT: vldrwt.u32 q1, [r3], #16
-; CHECK-NEXT: add.w r5, r11, r6
+; CHECK-NEXT: adds r7, r5, r6
; CHECK-NEXT: sub.w r10, r10, #4
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q6, q1, q0
-; CHECK-NEXT: vldrwt.u32 q1, [r11]
-; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill
+; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill
; CHECK-NEXT: vmov q6, q5
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q7, q1, q0
-; CHECK-NEXT: vmov q5, q3
-; CHECK-NEXT: vmov q3, q4
-; CHECK-NEXT: vmov q4, q2
+; CHECK-NEXT: vmov q5, q4
+; CHECK-NEXT: vmov q4, q3
+; CHECK-NEXT: vmov q3, q2
; CHECK-NEXT: vpst
-; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload
-; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r7]
-; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill
-; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill
+; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: adds r7, r5, r6
-; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill
-; CHECK-NEXT: vmov q2, q4
-; CHECK-NEXT: vmov q4, q3
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: adds r5, r7, r6
-; CHECK-NEXT: vmov q3, q5
+; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill
+; CHECK-NEXT: vmov q2, q3
+; CHECK-NEXT: vmov q3, q4
; CHECK-NEXT: vpstt
-; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vfmat.f32 q2, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
+; CHECK-NEXT: vmov q4, q5
+; CHECK-NEXT: adds r7, r5, r6
+; CHECK-NEXT: vpstt
+; CHECK-NEXT: vfmat.f32 q4, q1, q0
+; CHECK-NEXT: vldrwt.u32 q1, [r7]
; CHECK-NEXT: vmov q5, q6
-; CHECK-NEXT: add r5, r6
+; CHECK-NEXT: adds r5, r7, r6
; CHECK-NEXT: vpstt
; CHECK-NEXT: vfmat.f32 q5, q1, q0
; CHECK-NEXT: vldrwt.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload
; CHECK-NEXT: vpst
; CHECK-NEXT: vfmat.f32 q3, q1, q0
; CHECK-NEXT: le lr, .LBB7_3
@@ -1425,12 +1382,12 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vadd.f32 s6, s24, s25
; CHECK-NEXT: vadd.f32 s5, s18, s19
; CHECK-NEXT: vadd.f32 s7, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s10, s10, s11
; CHECK-NEXT: vadd.f32 s8, s8, s9
; CHECK-NEXT: vadd.f32 s9, s18, s19
; CHECK-NEXT: vadd.f32 s11, s16, s17
-; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload
+; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload
; CHECK-NEXT: vadd.f32 s14, s14, s15
; CHECK-NEXT: vadd.f32 s12, s12, s13
; CHECK-NEXT: vadd.f32 s13, s18, s19
@@ -1445,33 +1402,33 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt
; CHECK-NEXT: vstr s0, [r1]
; CHECK-NEXT: add.w r1, r2, r0, lsl #2
; CHECK-NEXT: vadd.f32 s3, s20, s21
-; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: vstr s2, [r1]
; CHECK-NEXT: add.w r1, r2, r8, lsl #2
; CHECK-NEXT: vadd.f32 s12, s7, s5
; CHECK-NEXT: vstr s10, [r1]
; CHECK-NEXT: add.w r1, r2, r4, lsl #2
; CHECK-NEXT: vstr s14, [r1]
-; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload
-; CHECK-NEXT: vadd.f32 s4, s3, s1
+; CHECK-NEXT: adds r1, r0, #4
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
+; CHECK-NEXT: vadd.f32 s4, s3, s1
; CHECK-NEXT: vstr s8, [r1]
-; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #5
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s12, [r1]
-; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #6
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s4, [r1]
-; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload
+; CHECK-NEXT: adds r1, r0, #7
+; CHECK-NEXT: adds r0, #8
; CHECK-NEXT: add.w r1, r2, r1, lsl #2
; CHECK-NEXT: vstr s6, [r1]
-; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload
; CHECK-NEXT: add r12, r1
-; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload
+; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload
; CHECK-NEXT: cmp r0, r1
; CHECK-NEXT: blo.w .LBB7_2
; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup
-; CHECK-NEXT: add sp, #88
+; CHECK-NEXT: add sp, #64
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: add sp, #4
; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
index 29c4fb9..413c4a1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll
@@ -1496,15 +1496,14 @@ define void @vfmasq(ptr %x, ptr %y, i32 %n) {
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB34_1: @ %for.body.preheader
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB34_2: @ %for.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0], #16
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r1], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0], #16
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r1], #16
; CHECK-NEXT: letp lr, .LBB34_2
; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup
; CHECK-NEXT: pop {r7, pc}
@@ -1542,15 +1541,14 @@ define void @vfmas(ptr %s1, ptr %s2, i32 %N) {
; CHECK-NEXT: it lt
; CHECK-NEXT: poplt {r7, pc}
; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph
-; CHECK-NEXT: vmov.f32 q0, #1.000000e+01
; CHECK-NEXT: dlstp.32 lr, r2
; CHECK-NEXT: .LBB35_2: @ %while.body
; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vldrw.u32 q1, [r1]
-; CHECK-NEXT: vldrw.u32 q2, [r0]
-; CHECK-NEXT: vfma.f32 q3, q2, q1
-; CHECK-NEXT: vstrw.32 q3, [r0], #16
+; CHECK-NEXT: vmov.f32 q2, #1.000000e+01
+; CHECK-NEXT: vldrw.u32 q0, [r1]
+; CHECK-NEXT: vldrw.u32 q1, [r0]
+; CHECK-NEXT: vfma.f32 q2, q1, q0
+; CHECK-NEXT: vstrw.32 q2, [r0], #16
; CHECK-NEXT: letp lr, .LBB35_2
; CHECK-NEXT: @ %bb.3: @ %while.end
; CHECK-NEXT: pop {r7, pc}
diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
index e845070..62482c1 100644
--- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll
@@ -287,17 +287,17 @@ define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n)
; CHECK-NEXT: cmp r2, #1
; CHECK-NEXT: blt .LBB5_3
; CHECK-NEXT: @ %bb.1: @ %vector.ph
-; CHECK-NEXT: adr.w lr, .LCPI5_0
-; CHECK-NEXT: adr r4, .LCPI5_1
+; CHECK-NEXT: adr r4, .LCPI5_0
+; CHECK-NEXT: adr r3, .LCPI5_1
; CHECK-NEXT: adr r5, .LCPI5_2
; CHECK-NEXT: adr r6, .LCPI5_3
-; CHECK-NEXT: vldrw.u32 q2, [r4]
+; CHECK-NEXT: vldrw.u32 q2, [r3]
+; CHECK-NEXT: vldrw.u32 q3, [r4]
; CHECK-NEXT: vldrw.u32 q0, [r6]
; CHECK-NEXT: vldrw.u32 q1, [r5]
-; CHECK-NEXT: vldrw.u32 q3, [lr]
+; CHECK-NEXT: vadd.i32 q2, q2, r1
; CHECK-NEXT: vadd.i32 q0, q0, r1
; CHECK-NEXT: vadd.i32 q1, q1, r1
-; CHECK-NEXT: vadd.i32 q2, q2, r1
; CHECK-NEXT: vadd.i32 q3, q3, r1
; CHECK-NEXT: mov.w r12, #1
; CHECK-NEXT: movs r4, #3
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
index f9948db..c92c2be 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll
@@ -656,14 +656,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK: @ %bb.0: @ %entry
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vcmp.i8 eq, q1, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r0, q5[0]
; CHECK-NEXT: vmov.16 q3[0], r0
; CHECK-NEXT: vmov.u8 r0, q5[1]
@@ -706,7 +704,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK-NEXT: orrs r1, r3
; CHECK-NEXT: add r0, r2
; CHECK-NEXT: vmov r2, r3, d15
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
; CHECK-NEXT: vmov.u8 r2, q2[3]
@@ -785,6 +783,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vpsel q6, q1, q7
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -853,7 +852,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) {
; CHECK-NEXT: vmov r2, r3, d1
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: bx lr
entry:
@@ -2065,14 +2063,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q2, q0
; CHECK-NEXT: vcmp.i8 eq, q1, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q1, #0xff
; CHECK-NEXT: vpsel q5, q1, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r2, q5[0]
; CHECK-NEXT: vmov.16 q3[0], r2
; CHECK-NEXT: vmov.u8 r2, q5[1]
@@ -2115,7 +2111,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: orr.w lr, lr, r3
; CHECK-NEXT: add r12, r2
; CHECK-NEXT: vmov r3, r2, d15
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vmov.u8 r2, q2[3]
@@ -2194,6 +2190,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
; CHECK-NEXT: vpsel q6, q1, q7
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -2264,7 +2261,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b
; CHECK-NEXT: adc.w r3, r3, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
entry:
diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
index 63b1431..9f55183 100644
--- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
+++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll
@@ -817,16 +817,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q6, q2, q0
-; CHECK-NEXT: vmov q4, q0
+; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.u8 r0, q6[0]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov.16 q0[0], r0
; CHECK-NEXT: vmov.u8 r0, q6[1]
; CHECK-NEXT: vmov.16 q0[1], r0
@@ -842,9 +840,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.16 q0[6], r0
; CHECK-NEXT: vmov.u8 r0, q6[7]
; CHECK-NEXT: vmov.16 q0[7], r0
-; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vmov.u8 r2, q3[0]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vpsel q7, q2, q4
; CHECK-NEXT: vmov.u16 r0, q7[2]
; CHECK-NEXT: vmov.u16 r1, q7[0]
@@ -895,7 +892,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0xff
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -916,8 +913,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.u16 r3, q7[5]
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vpsel q0, q0, q4
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
; CHECK-NEXT: vmov q4[3], q4[1], r2, r3
@@ -932,7 +929,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov r0, s30
; CHECK-NEXT: vmov r1, s28
-; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0xff
; CHECK-NEXT: vmov r3, s16
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -960,7 +957,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -1041,7 +1038,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r0, s18
; CHECK-NEXT: vmov r1, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r3, s0
; CHECK-NEXT: umull r0, r2, r0, r2
; CHECK-NEXT: umull r1, r3, r1, r3
@@ -1062,7 +1059,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.u16 r3, q6[5]
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: vpsel q0, q0, q4
; CHECK-NEXT: vmov r2, r3, d0
; CHECK-NEXT: vmov q4[2], q4[0], r2, r3
@@ -1117,7 +1114,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -1137,16 +1133,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: push {r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q5, q2, q0
-; CHECK-NEXT: vmov.s8 r2, q1[0]
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r0, q5[0]
-; CHECK-NEXT: vmov.s8 r3, q3[0]
+; CHECK-NEXT: vmov.s8 r2, q1[0]
; CHECK-NEXT: vmov.16 q4[0], r0
; CHECK-NEXT: vmov.u8 r0, q5[1]
; CHECK-NEXT: vmov.16 q4[1], r0
@@ -1162,9 +1156,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.16 q4[6], r0
; CHECK-NEXT: vmov.u8 r0, q5[7]
; CHECK-NEXT: vmov.16 q4[7], r0
-; CHECK-NEXT: smull r2, r3, r3, r2
+; CHECK-NEXT: vmov.s8 r3, q3[0]
; CHECK-NEXT: vcmp.i16 ne, q4, zr
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vpsel q6, q2, q0
; CHECK-NEXT: vmov.u16 r0, q6[2]
; CHECK-NEXT: vmov.u16 r1, q6[0]
@@ -1198,7 +1192,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.s8 r3, q3[3]
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov q0[2], q0[0], r0, r2
; CHECK-NEXT: vmov q0[3], q0[1], r1, r3
@@ -1219,7 +1213,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q6, q2, q0
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -1273,17 +1268,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: vmov.16 q6[7], r2
; CHECK-NEXT: vmov.s8 r0, q1[8]
; CHECK-NEXT: vcmp.i16 ne, q6, zr
+; CHECK-NEXT: vmov.i8 q6, #0x0
+; CHECK-NEXT: vpsel q5, q2, q6
; CHECK-NEXT: vmov.s8 r1, q3[8]
-; CHECK-NEXT: vpsel q5, q2, q7
-; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov.u16 r2, q5[2]
; CHECK-NEXT: vmov.u16 r3, q5[0]
; CHECK-NEXT: vmov q0[2], q0[0], r3, r2
; CHECK-NEXT: vmov.u16 r2, q5[3]
; CHECK-NEXT: vmov.u16 r3, q5[1]
+; CHECK-NEXT: smull r0, r1, r1, r0
; CHECK-NEXT: vmov q0[3], q0[1], r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vpsel q6, q2, q6
; CHECK-NEXT: vmov r2, r3, d12
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
@@ -1365,7 +1361,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1
; CHECK-NEXT: adc.w r1, r1, lr
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r7, pc}
entry:
@@ -2296,16 +2291,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #32
-; CHECK-NEXT: sub sp, #32
; CHECK-NEXT: vmov q3, q0
-; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q6, q2, q0
-; CHECK-NEXT: vmov q4, q0
+; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vmov.u8 r2, q6[0]
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov.16 q0[0], r2
; CHECK-NEXT: vmov.u8 r2, q6[1]
; CHECK-NEXT: vmov.16 q0[1], r2
@@ -2321,9 +2314,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.16 q0[6], r2
; CHECK-NEXT: vmov.u8 r2, q6[7]
; CHECK-NEXT: vmov.16 q0[7], r2
-; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill
-; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vmov.u8 r4, q3[2]
+; CHECK-NEXT: vcmp.i16 ne, q0, zr
; CHECK-NEXT: vpsel q7, q2, q4
; CHECK-NEXT: vmov.u16 r2, q7[2]
; CHECK-NEXT: vmov.u16 r3, q7[0]
@@ -2374,7 +2366,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r2, s2
; CHECK-NEXT: vmov r3, s18
; CHECK-NEXT: vmov r5, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0xff
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umull r2, r3, r3, r2
; CHECK-NEXT: umull r4, r5, r5, r4
@@ -2395,8 +2387,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.u16 r4, q7[5]
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
-; CHECK-NEXT: vpsel q0, q0, q4
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q0, q4, q0
; CHECK-NEXT: vmov r5, r4, d0
; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
; CHECK-NEXT: vmov q4[3], q4[1], r5, r4
@@ -2411,7 +2403,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r5, s18
; CHECK-NEXT: vmov r2, s30
; CHECK-NEXT: vmov r3, s28
-; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0xff
; CHECK-NEXT: vmov r4, s16
; CHECK-NEXT: umull r2, r5, r2, r5
; CHECK-NEXT: umull r3, r4, r3, r4
@@ -2439,7 +2431,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r5, s2
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umull r2, r5, r2, r5
; CHECK-NEXT: umull r3, r4, r3, r4
@@ -2520,7 +2512,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov r5, s2
; CHECK-NEXT: vmov r2, s18
; CHECK-NEXT: vmov r3, s16
-; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q4, #0x0
; CHECK-NEXT: vmov r4, s0
; CHECK-NEXT: umull r2, r5, r2, r5
; CHECK-NEXT: umull r3, r4, r3, r4
@@ -2541,7 +2533,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.u16 r4, q6[5]
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q0, #0xff
; CHECK-NEXT: vpsel q0, q0, q4
; CHECK-NEXT: vmov r5, r4, d0
; CHECK-NEXT: vmov q4[2], q4[0], r5, r4
@@ -2598,7 +2590,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #32
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
@@ -2619,14 +2610,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: push {r4, r5, r7, lr}
; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15}
-; CHECK-NEXT: .pad #16
-; CHECK-NEXT: sub sp, #16
; CHECK-NEXT: vmov q3, q0
; CHECK-NEXT: vcmp.i8 eq, q2, zr
; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.i8 q2, #0xff
; CHECK-NEXT: vpsel q5, q2, q0
-; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill
+; CHECK-NEXT: vmov.i8 q0, #0x0
; CHECK-NEXT: vmov.u8 r2, q5[0]
; CHECK-NEXT: vmov.s8 r4, q1[2]
; CHECK-NEXT: vmov.16 q4[0], r2
@@ -2676,7 +2665,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: adc.w r12, r12, r2
; CHECK-NEXT: vmov r2, r3, d15
; CHECK-NEXT: vmov q0[2], q0[0], r2, r3
-; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload
+; CHECK-NEXT: vmov.i8 q7, #0x0
; CHECK-NEXT: vmov q0[3], q0[1], r2, r3
; CHECK-NEXT: vmov.s8 r2, q1[3]
; CHECK-NEXT: vmov.s8 r3, q3[3]
@@ -2701,7 +2690,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vmov.i8 q0, #0x0
+; CHECK-NEXT: vpsel q6, q2, q0
; CHECK-NEXT: vmov r5, r4, d12
; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
@@ -2755,17 +2745,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: vmov.16 q6[7], r5
; CHECK-NEXT: vmov.s8 r2, q1[8]
; CHECK-NEXT: vcmp.i16 ne, q6, zr
+; CHECK-NEXT: vmov.i8 q6, #0x0
+; CHECK-NEXT: vpsel q5, q2, q6
; CHECK-NEXT: vmov.s8 r3, q3[8]
-; CHECK-NEXT: vpsel q5, q2, q7
-; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov.u16 r5, q5[2]
; CHECK-NEXT: vmov.u16 r4, q5[0]
; CHECK-NEXT: vmov q0[2], q0[0], r4, r5
; CHECK-NEXT: vmov.u16 r5, q5[3]
; CHECK-NEXT: vmov.u16 r4, q5[1]
+; CHECK-NEXT: smull r2, r3, r3, r2
; CHECK-NEXT: vmov q0[3], q0[1], r4, r5
; CHECK-NEXT: vcmp.i32 ne, q0, zr
-; CHECK-NEXT: vpsel q6, q2, q7
+; CHECK-NEXT: vpsel q6, q2, q6
; CHECK-NEXT: vmov r5, r4, d12
; CHECK-NEXT: vmov q0[2], q0[0], r5, r4
; CHECK-NEXT: vmov q0[3], q0[1], r5, r4
@@ -2849,7 +2840,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y
; CHECK-NEXT: adcs r3, r4
; CHECK-NEXT: adds r0, r0, r2
; CHECK-NEXT: adcs r1, r3
-; CHECK-NEXT: add sp, #16
; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15}
; CHECK-NEXT: pop {r4, r5, r7, pc}
entry:
diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
index 0c349c3..cba394f 100644
--- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
+++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll
@@ -59,18 +59,18 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOV32rm2]].sub_32bit
; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16)
- ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_1]] :: (store (s64) into %stack.5)
; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]]
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[MOV32rm3]]
- ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[COPY6]]
+ ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm3]]
; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]]
; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8)
+ ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[COPY1]] :: (store (s64) into %stack.5)
; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6)
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY7]], [[MOVSX64rr32_2]], implicit-def dead $eflags
- ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY7]], [[MOVSX64rm32_]], implicit-def dead $eflags
- ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY7]], 0, $noreg
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]]
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY6]], [[MOVSX64rr32_2]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY6]], [[MOVSX64rm32_]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY6]], 0, $noreg
; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9)
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[MOV32rm2]]
; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7)
; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]]
; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = IMUL64rr [[COPY8]], [[MOVSX64rr32_2]], implicit-def dead $eflags
@@ -87,8 +87,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8)
; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16)
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
- ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+ ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
+ ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3)
+ ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
+ ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm6]]
; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags
; CHECK-NEXT: JMP_1 %bb.3
; CHECK-NEXT: {{ $}}
@@ -98,9 +101,8 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6)
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags
; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1)
- ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13)
- ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12)
- ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]]
+ ; CHECK-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12)
+ ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm7]]
; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9)
; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4)
; CHECK-NEXT: JMP_1 %bb.6
@@ -123,40 +125,30 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row
; CHECK-NEXT: bb.6.for.body17:
; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000)
; CHECK-NEXT: {{ $}}
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit
- ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13)
- ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg
- ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit
- ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]]
- ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]]
- ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]]
- ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]]
- ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
- ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
- ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
- ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]]
- ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]]
- ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg
- ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]]
- ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2)
- ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg
- ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]]
- ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]]
- ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]]
- ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]]
- ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY15]]
- ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]]
- ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]]
- ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]]
- ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]]
- ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11)
- ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]]
- ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]]
- ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit
+ ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY9]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm2]], 1, [[MOVSX64rr32_]], 0, $noreg
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY10]].sub_32bit
+ ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]]
+ ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]]
+ ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]]
+ ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]]
+ ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY7]]
+ ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5)
+ ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[MOV64rm7]], 1, [[COPY10]], 0, $noreg
+ ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm]], 0, $noreg
+ ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[COPY15]]
+ ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY14]]
+ ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY13]]
+ ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY12]]
+ ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = COPY [[COPY11]]
+ ; CHECK-NEXT: [[MOV32rm8:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16)
+ ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm8]]
+ ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY9]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]]
+ ; CHECK-NEXT: PTILESTOREDV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]]
; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags
- ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags
- ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags
- ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags
+ ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm1]], implicit-def dead $eflags
+ ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY10]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags
+ ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOVSX64rm32_1]], implicit-def $eflags
; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags
; CHECK-NEXT: JMP_1 %bb.5
entry:
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index bf6b096..b428ce4 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -136,8 +136,6 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: movl (%r8), %edx
; CHECK-NEXT: leal 8(,%rbx,8), %eax
; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; CHECK-NEXT: leaq 8(%rsi), %rax
-; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; CHECK-NEXT: leaq 32(%rsi), %r11
; CHECK-NEXT: leaq 8(,%rbx,8), %rbx
; CHECK-NEXT: xorl %r14d, %r14d
@@ -189,7 +187,8 @@ define void @_Z2x6v() local_unnamed_addr {
; CHECK-NEXT: jae .LBB1_7
; CHECK-NEXT: # %bb.6: # %vector.memcheck
; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1
-; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
+; CHECK-NEXT: leaq 8(%rsi), %r9
+; CHECK-NEXT: addq %r9, %rax
; CHECK-NEXT: leaq (%rax,%r10,8), %rax
; CHECK-NEXT: cmpq %r15, %rax
; CHECK-NEXT: ja .LBB1_14
diff --git a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
index 10ee445..d355374 100644
--- a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
+++ b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir
@@ -7,8 +7,8 @@
# CHECK: jne
# CHECK: andl $-16, %edx
# CHECK: xorl %ebx, %ebx
-# CHECK: movl -16(%ebp), %esi
-# CHECK: xorl %eax, %eax
+# CHECK: xorl %esi, %esi
+# CHECK: movl %eax, %ecx
name: test
tracksRegLiveness: true
diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll
index c2728f7..68cb24d 100644
--- a/llvm/test/CodeGen/X86/inalloca-invoke.ll
+++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll
@@ -23,7 +23,6 @@ blah:
; CHECK: pushl %eax
; CHECK: subl $20, %esp
; CHECK: movl %esp, %[[beg:[^ ]*]]
-; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
call void @begin(ptr sret(%Iter) %temp.lvalue)
; CHECK: calll _begin
@@ -32,6 +31,7 @@ blah:
to label %invoke.cont unwind label %lpad
; Uses end as sret param.
+; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
; CHECK: pushl %[[end]]
; CHECK: calll _plus
diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll
index 72a4832..26ed2a3 100644
--- a/llvm/test/CodeGen/X86/licm-regpressure.ll
+++ b/llvm/test/CodeGen/X86/licm-regpressure.ll
@@ -1,14 +1,64 @@
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s
-; This tests currently fails as MachineLICM does not compute register pressure
-; correctly. More details: llvm.org/PR23143
-; XFAIL: *
+; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6
+; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s
-; MachineLICM should take register pressure into account.
-; CHECK-NOT: Spill
+; FIXME: MachineLICM does not compute register pressure correctly and we end up
+; emitting too many ADD64ri32s. More details: llvm.org/PR23143
%struct.A = type { i32, i32, i32, i32, i32, i32, i32 }
define void @test(i1 %b, ptr %a) nounwind {
+ ; CHECK-LABEL: name: test
+ ; CHECK: bb.0.entry:
+ ; CHECK-NEXT: successors: %bb.1(0x80000000)
+ ; CHECK-NEXT: liveins: $edi, $rsi
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi
+ ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr8 = COPY [[COPY1]].sub_8bit
+ ; CHECK-NEXT: [[ADD64ri32_:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 4, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_1:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 8, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_2:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 12, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_3:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 16, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_4:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 20, implicit-def dead $eflags
+ ; CHECK-NEXT: [[ADD64ri32_5:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 24, implicit-def dead $eflags
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.1.loop-body:
+ ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000)
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[COPY]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_1]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_2]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_3]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_4]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_5]]
+ ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp
+ ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp
+ ; CHECK-NEXT: TEST8ri [[COPY2]], 1, implicit-def $eflags
+ ; CHECK-NEXT: JCC_1 %bb.1, 5, implicit $eflags
+ ; CHECK-NEXT: JMP_1 %bb.2
+ ; CHECK-NEXT: {{ $}}
+ ; CHECK-NEXT: bb.2.loop-exit:
+ ; CHECK-NEXT: RET 0
entry:
br label %loop-header