diff options
Diffstat (limited to 'llvm/test')
34 files changed, 1549 insertions, 1489 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index ed68723..41f7ab8 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -1219,14 +1219,14 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; ; GISEL-LABEL: test_shl_i1024: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: sub sp, sp, #416 -; GISEL-NEXT: stp x28, x27, [sp, #320] ; 16-byte Folded Spill -; GISEL-NEXT: stp x26, x25, [sp, #336] ; 16-byte Folded Spill -; GISEL-NEXT: stp x24, x23, [sp, #352] ; 16-byte Folded Spill -; GISEL-NEXT: stp x22, x21, [sp, #368] ; 16-byte Folded Spill -; GISEL-NEXT: stp x20, x19, [sp, #384] ; 16-byte Folded Spill -; GISEL-NEXT: stp x29, x30, [sp, #400] ; 16-byte Folded Spill -; GISEL-NEXT: .cfi_def_cfa_offset 416 +; GISEL-NEXT: sub sp, sp, #432 +; GISEL-NEXT: stp x28, x27, [sp, #336] ; 16-byte Folded Spill +; GISEL-NEXT: stp x26, x25, [sp, #352] ; 16-byte Folded Spill +; GISEL-NEXT: stp x24, x23, [sp, #368] ; 16-byte Folded Spill +; GISEL-NEXT: stp x22, x21, [sp, #384] ; 16-byte Folded Spill +; GISEL-NEXT: stp x20, x19, [sp, #400] ; 16-byte Folded Spill +; GISEL-NEXT: stp x29, x30, [sp, #416] ; 16-byte Folded Spill +; GISEL-NEXT: .cfi_def_cfa_offset 432 ; GISEL-NEXT: .cfi_offset w30, -8 ; GISEL-NEXT: .cfi_offset w29, -16 ; GISEL-NEXT: .cfi_offset w19, -24 @@ -1242,38 +1242,44 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: ldp x10, x11, [x1] ; GISEL-NEXT: mov w8, w2 ; GISEL-NEXT: lsr x9, x8, #6 -; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: and x12, x8, #0x3f +; GISEL-NEXT: str x0, [sp, #144] ; 8-byte Folded Spill +; GISEL-NEXT: and x14, x8, #0x3f ; GISEL-NEXT: mov w13, #64 ; =0x40 -; GISEL-NEXT: sub x21, x13, x16 -; GISEL-NEXT: str x0, [sp, #112] ; 8-byte Folded Spill -; GISEL-NEXT: mov x24, x16 -; GISEL-NEXT: lsl x25, x10, x16 +; GISEL-NEXT: and x16, x8, #0x3f +; GISEL-NEXT: lsl x0, x10, x12 ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: lsr x26, x10, x21 -; GISEL-NEXT: lsl x2, x11, x16 -; GISEL-NEXT: lsr x23, x11, x21 -; GISEL-NEXT: mov x22, x21 -; GISEL-NEXT: csel x12, x25, xzr, eq +; GISEL-NEXT: sub x2, x13, x14 +; GISEL-NEXT: lsr x3, x10, x2 +; GISEL-NEXT: lsl x6, x11, x14 +; GISEL-NEXT: and x14, x8, #0x3f +; GISEL-NEXT: csel x12, x0, xzr, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x1, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: lsr x20, x11, x2 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: str x23, [sp, #208] ; 8-byte Folded Spill +; GISEL-NEXT: mov x24, x0 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: stp x24, x22, [sp, #40] ; 16-byte Folded Spill +; GISEL-NEXT: mov x7, x3 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #4 +; GISEL-NEXT: mov x28, x1 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #5 +; GISEL-NEXT: and x21, x8, #0x3f ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #6 +; GISEL-NEXT: str x6, [sp, #24] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #7 +; GISEL-NEXT: str x28, [sp, #304] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #8 +; GISEL-NEXT: str x7, [sp, #272] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: str x20, [sp, #112] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #10 ; GISEL-NEXT: csel x12, xzr, x12, eq @@ -1290,13 +1296,13 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x10, x10, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x10, [sp, #192] ; 8-byte Folded Spill -; GISEL-NEXT: csel x10, xzr, x26, eq +; GISEL-NEXT: str x10, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x3, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x10, x2, x10 +; GISEL-NEXT: orr x10, x6, x10 ; GISEL-NEXT: csel x10, x10, xzr, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: csel x10, x0, x10, eq ; GISEL-NEXT: cmp x9, #2 ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #3 @@ -1327,25 +1333,24 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x13, xzr, x13, eq ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: lsl x20, x12, x16 +; GISEL-NEXT: lsl x26, x12, x14 ; GISEL-NEXT: csel x11, x11, x13, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill -; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: str x11, [sp, #224] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x20, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x20, x11 -; GISEL-NEXT: lsr x15, x12, x21 -; GISEL-NEXT: lsl x14, x10, x16 +; GISEL-NEXT: orr x11, x26, x11 +; GISEL-NEXT: lsr x15, x12, x2 +; GISEL-NEXT: lsl x30, x10, x16 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: lsr x17, x10, x21 -; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: lsr x17, x10, x2 +; GISEL-NEXT: csel x13, xzr, x3, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x20, [sp, #8] ; 8-byte Folded Spill -; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: orr x13, x6, x13 ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: csel x11, x0, x11, eq ; GISEL-NEXT: cmp x9, #3 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #4 @@ -1375,23 +1380,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #176] ; 8-byte Folded Spill +; GISEL-NEXT: str x11, [sp, #216] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, xzr, x15, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x14, x11 +; GISEL-NEXT: orr x11, x30, x11 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: csel x12, xzr, x20, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: orr x12, x26, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: csel x12, xzr, x3, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: orr x12, x6, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: csel x11, x0, x11, eq ; GISEL-NEXT: cmp x9, #4 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #5 @@ -1421,33 +1426,33 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: lsl x0, x12, x16 ; GISEL-NEXT: csel x10, x10, x13, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x10, [sp, #168] ; 8-byte Folded Spill +; GISEL-NEXT: str x10, [sp, #208] ; 8-byte Folded Spill ; GISEL-NEXT: csel x10, xzr, x17, eq ; GISEL-NEXT: cmp x9, #0 ; GISEL-NEXT: orr x10, x0, x10 -; GISEL-NEXT: lsr x27, x12, x21 +; GISEL-NEXT: lsr x4, x12, x2 ; GISEL-NEXT: lsl x19, x11, x16 ; GISEL-NEXT: csel x10, x10, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: lsr x3, x11, x21 +; GISEL-NEXT: mov x16, x15 ; GISEL-NEXT: csel x13, xzr, x15, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: stp x27, x0, [sp, #240] ; 16-byte Folded Spill -; GISEL-NEXT: orr x13, x14, x13 -; GISEL-NEXT: mov x7, x3 +; GISEL-NEXT: str x4, [sp, #248] ; 8-byte Folded Spill +; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: str x0, [sp, #48] ; 8-byte Folded Spill ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: csel x13, xzr, x20, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x13, x20, x13 +; GISEL-NEXT: orr x13, x26, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: csel x13, xzr, x3, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: orr x13, x6, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: csel x10, x24, x10, eq ; GISEL-NEXT: cmp x9, #5 ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #6 @@ -1473,8 +1478,8 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x10, x12, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x10, [sp, #160] ; 8-byte Folded Spill -; GISEL-NEXT: csel x10, xzr, x27, eq +; GISEL-NEXT: str x10, [sp, #200] ; 8-byte Folded Spill +; GISEL-NEXT: csel x10, xzr, x4, eq ; GISEL-NEXT: cmp x9, #0 ; GISEL-NEXT: orr x10, x19, x10 ; GISEL-NEXT: csel x10, x10, xzr, eq @@ -1486,20 +1491,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x12, xzr, x15, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x12, x14, x12 +; GISEL-NEXT: and x15, x8, #0x3f +; GISEL-NEXT: orr x12, x30, x12 ; GISEL-NEXT: csel x10, x12, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: csel x12, xzr, x20, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: orr x12, x26, x12 ; GISEL-NEXT: csel x10, x12, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: csel x12, xzr, x3, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: lsr x3, x11, x2 +; GISEL-NEXT: orr x12, x6, x12 ; GISEL-NEXT: csel x10, x12, x10, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: csel x10, x25, x10, eq +; GISEL-NEXT: csel x10, x24, x10, eq ; GISEL-NEXT: cmp x9, #6 ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #7 @@ -1522,21 +1529,23 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x13, xzr, x13, eq ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: lsl x4, x12, x16 +; GISEL-NEXT: lsl x22, x12, x15 ; GISEL-NEXT: csel x11, x11, x13, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill +; GISEL-NEXT: str x11, [sp, #192] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, xzr, x3, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x4, x11 -; GISEL-NEXT: lsl x30, x10, x16 -; GISEL-NEXT: lsr x28, x10, x21 +; GISEL-NEXT: orr x11, x22, x11 +; GISEL-NEXT: lsl x5, x10, x15 +; GISEL-NEXT: lsr x27, x10, x2 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x27, eq +; GISEL-NEXT: csel x13, xzr, x4, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x30, [sp, #200] ; 8-byte Folded Spill +; GISEL-NEXT: mov x25, x27 ; GISEL-NEXT: orr x13, x19, x13 +; GISEL-NEXT: mov x14, x5 +; GISEL-NEXT: str x27, [sp, #328] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x13, xzr, x17, eq @@ -1544,30 +1553,29 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: orr x13, x0, x13 ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x15, eq +; GISEL-NEXT: csel x13, xzr, x16, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: orr x13, x14, x13 +; GISEL-NEXT: orr x13, x30, x13 ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: csel x13, xzr, x20, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: orr x13, x20, x13 +; GISEL-NEXT: orr x13, x26, x13 ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: csel x13, xzr, x7, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: orr x13, x6, x13 ; GISEL-NEXT: csel x11, x13, x11, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: lsr x13, x12, x21 -; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: lsr x13, x12, x2 +; GISEL-NEXT: csel x11, x24, x11, eq ; GISEL-NEXT: cmp x9, #7 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: mov x6, x13 +; GISEL-NEXT: mov x15, x13 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #9 -; GISEL-NEXT: str x6, [sp, #256] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #10 ; GISEL-NEXT: csel x11, xzr, x11, eq @@ -1584,18 +1592,18 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #144] ; 8-byte Folded Spill +; GISEL-NEXT: str x11, [sp, #184] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, xzr, x13, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x30, x11 +; GISEL-NEXT: orr x11, x5, x11 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x12, xzr, x3, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x12, x4, x12 +; GISEL-NEXT: orr x12, x22, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x27, eq +; GISEL-NEXT: csel x12, xzr, x4, eq ; GISEL-NEXT: cmp x9, #2 ; GISEL-NEXT: orr x12, x19, x12 ; GISEL-NEXT: csel x11, x12, x11, eq @@ -1605,22 +1613,22 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: orr x12, x0, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x15, eq +; GISEL-NEXT: csel x12, xzr, x16, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: orr x12, x14, x12 +; GISEL-NEXT: orr x12, x30, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: csel x12, xzr, x20, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: orr x12, x20, x12 +; GISEL-NEXT: orr x12, x26, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: csel x12, xzr, x7, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: orr x12, x6, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: csel x11, x24, x11, eq ; GISEL-NEXT: cmp x9, #8 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #9 @@ -1635,39 +1643,34 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #14 ; GISEL-NEXT: csel x12, xzr, x11, eq -; GISEL-NEXT: ldp x11, x5, [x1, #64] +; GISEL-NEXT: ldp x11, x1, [x1, #64] ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x12, x10, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: lsl x21, x11, x16 -; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill -; GISEL-NEXT: csel x12, xzr, x28, eq +; GISEL-NEXT: lsl x23, x11, x21 +; GISEL-NEXT: str x12, [sp, #176] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x27, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x12, x21, x12 -; GISEL-NEXT: lsr x10, x11, x22 -; GISEL-NEXT: mov x16, x19 +; GISEL-NEXT: orr x12, x23, x12 +; GISEL-NEXT: lsr x21, x11, x2 +; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, x12, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: mov x1, x16 ; GISEL-NEXT: csel x13, xzr, x13, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: str x16, [sp, #304] ; 8-byte Folded Spill -; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: orr x13, x5, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x13, xzr, x3, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: lsl x3, x5, x24 -; GISEL-NEXT: orr x13, x4, x13 +; GISEL-NEXT: orr x13, x22, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: stp x21, x3, [sp, #216] ; 16-byte Folded Spill -; GISEL-NEXT: csel x13, xzr, x27, eq +; GISEL-NEXT: csel x13, xzr, x4, eq ; GISEL-NEXT: cmp x9, #3 ; GISEL-NEXT: orr x13, x19, x13 -; GISEL-NEXT: mov x19, x28 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x13, xzr, x17, eq @@ -1675,27 +1678,30 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: orr x13, x0, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x15, eq +; GISEL-NEXT: csel x13, xzr, x16, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: orr x13, x14, x13 +; GISEL-NEXT: orr x13, x30, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x23, eq +; GISEL-NEXT: csel x13, xzr, x20, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: orr x13, x20, x13 +; GISEL-NEXT: orr x13, x26, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: csel x13, xzr, x7, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: orr x13, x6, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: csel x12, x25, x12, eq +; GISEL-NEXT: and x13, x8, #0x3f +; GISEL-NEXT: csel x12, x24, x12, eq ; GISEL-NEXT: cmp x9, #9 +; GISEL-NEXT: lsl x10, x1, x13 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #10 ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: stp x10, x15, [sp, #312] ; 16-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #12 ; GISEL-NEXT: csel x12, xzr, x12, eq @@ -1708,69 +1714,69 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x11, x11, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #128] ; 8-byte Folded Spill -; GISEL-NEXT: csel x11, xzr, x10, eq +; GISEL-NEXT: str x11, [sp, #168] ; 8-byte Folded Spill +; GISEL-NEXT: csel x11, xzr, x21, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: orr x11, x10, x11 +; GISEL-NEXT: mov x10, x23 ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x28, eq +; GISEL-NEXT: csel x12, xzr, x27, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: mov x28, x4 -; GISEL-NEXT: orr x12, x21, x12 -; GISEL-NEXT: str x28, [sp, #32] ; 8-byte Folded Spill +; GISEL-NEXT: mov x27, x24 +; GISEL-NEXT: orr x12, x23, x12 +; GISEL-NEXT: mov x23, x15 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x6, eq +; GISEL-NEXT: csel x12, xzr, x15, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x12, x30, x12 +; GISEL-NEXT: mov x15, x22 +; GISEL-NEXT: orr x12, x5, x12 +; GISEL-NEXT: mov x5, x3 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x7, eq +; GISEL-NEXT: stp x14, x5, [sp, #256] ; 16-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x3, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: orr x12, x4, x12 -; GISEL-NEXT: mov x4, x20 +; GISEL-NEXT: mov x5, x4 +; GISEL-NEXT: orr x12, x22, x12 +; GISEL-NEXT: lsr x22, x1, x2 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x27, eq +; GISEL-NEXT: csel x12, xzr, x4, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: mov x27, x2 -; GISEL-NEXT: orr x12, x16, x12 -; GISEL-NEXT: mov x16, x17 +; GISEL-NEXT: str x22, [sp, #240] ; 8-byte Folded Spill +; GISEL-NEXT: orr x12, x19, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x12, xzr, x17, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: mov x17, x15 ; GISEL-NEXT: orr x12, x0, x12 -; GISEL-NEXT: lsr x0, x5, x22 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x15, eq +; GISEL-NEXT: csel x12, xzr, x16, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload -; GISEL-NEXT: orr x12, x14, x12 -; GISEL-NEXT: str x0, [sp, #280] ; 8-byte Folded Spill +; GISEL-NEXT: orr x12, x30, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x23, eq +; GISEL-NEXT: csel x12, xzr, x20, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: mov x23, x25 -; GISEL-NEXT: orr x12, x20, x12 -; GISEL-NEXT: str x23, [sp, #288] ; 8-byte Folded Spill +; GISEL-NEXT: orr x12, x26, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x26, eq +; GISEL-NEXT: csel x12, xzr, x7, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: orr x12, x2, x12 -; GISEL-NEXT: mov x2, x3 +; GISEL-NEXT: mov x7, x14 +; GISEL-NEXT: orr x12, x6, x12 +; GISEL-NEXT: mov x6, x28 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: cmp x9, #9 -; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: csel x11, x24, x11, eq ; GISEL-NEXT: cmp x9, #10 -; GISEL-NEXT: mov x25, x26 +; GISEL-NEXT: ldr x24, [x6, #88] ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #11 +; GISEL-NEXT: ldr x6, [sp, #272] ; 8-byte Folded Reload ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #12 ; GISEL-NEXT: csel x11, xzr, x11, eq @@ -1780,80 +1786,84 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x12, xzr, x11, eq +; GISEL-NEXT: ldr x11, [x28, #80] ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: csel x12, x5, x12, eq -; GISEL-NEXT: ldp x11, x5, [x15, #80] +; GISEL-NEXT: csel x12, x1, x12, eq +; GISEL-NEXT: mov x28, x2 ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x12, [sp, #120] ; 8-byte Folded Spill -; GISEL-NEXT: mov x15, x7 -; GISEL-NEXT: csel x12, xzr, x0, eq +; GISEL-NEXT: lsl x2, x11, x13 +; GISEL-NEXT: str x12, [sp, #160] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x22, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: str x15, [sp, #24] ; 8-byte Folded Spill -; GISEL-NEXT: lsl x20, x11, x24 -; GISEL-NEXT: orr x12, x20, x12 -; GISEL-NEXT: str x20, [sp, #232] ; 8-byte Folded Spill +; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: str x28, [sp, #16] ; 8-byte Folded Spill +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: str x2, [sp, #280] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, x12, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x10, eq +; GISEL-NEXT: csel x13, xzr, x21, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x13, x3, x13 -; GISEL-NEXT: lsl x3, x5, x24 +; GISEL-NEXT: orr x13, x1, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x19, eq +; GISEL-NEXT: csel x13, xzr, x25, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: stp x19, x3, [sp, #264] ; 16-byte Folded Spill -; GISEL-NEXT: orr x13, x21, x13 +; GISEL-NEXT: mov x25, x16 +; GISEL-NEXT: orr x13, x10, x13 +; GISEL-NEXT: mov x10, x30 +; GISEL-NEXT: str x25, [sp, #80] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x6, eq +; GISEL-NEXT: csel x13, xzr, x23, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: mov x23, x3 +; GISEL-NEXT: orr x13, x14, x13 +; GISEL-NEXT: mov x14, x17 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x7, eq -; GISEL-NEXT: ldp x7, x30, [sp, #240] ; 16-byte Folded Reload +; GISEL-NEXT: stp x19, x14, [sp, #64] ; 16-byte Folded Spill +; GISEL-NEXT: csel x13, xzr, x3, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: orr x13, x28, x13 +; GISEL-NEXT: mov x3, x21 +; GISEL-NEXT: orr x13, x15, x13 +; GISEL-NEXT: str x3, [sp, #32] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x7, eq +; GISEL-NEXT: csel x13, xzr, x4, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: orr x13, x1, x13 -; GISEL-NEXT: mov x1, x14 +; GISEL-NEXT: mov x4, x0 +; GISEL-NEXT: orr x13, x19, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x16, eq +; GISEL-NEXT: csel x13, xzr, x17, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: mov x17, x27 +; GISEL-NEXT: orr x13, x0, x13 +; GISEL-NEXT: ldr x0, [sp, #24] ; 8-byte Folded Reload ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x17, eq +; GISEL-NEXT: csel x13, xzr, x16, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: orr x13, x14, x13 -; GISEL-NEXT: ldr x14, [sp, #208] ; 8-byte Folded Reload +; GISEL-NEXT: orr x13, x30, x13 +; GISEL-NEXT: ldp x30, x16, [sp, #320] ; 16-byte Folded Reload ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x14, eq +; GISEL-NEXT: csel x13, xzr, x20, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: orr x13, x4, x13 -; GISEL-NEXT: mov x4, x10 +; GISEL-NEXT: orr x13, x26, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x26, eq +; GISEL-NEXT: csel x13, xzr, x6, eq ; GISEL-NEXT: cmp x9, #9 -; GISEL-NEXT: mov x26, x27 -; GISEL-NEXT: orr x13, x27, x13 -; GISEL-NEXT: lsr x27, x11, x22 +; GISEL-NEXT: orr x13, x0, x13 ; GISEL-NEXT: csel x12, x13, x12, eq ; GISEL-NEXT: cmp x9, #10 -; GISEL-NEXT: mov x13, x23 -; GISEL-NEXT: csel x12, x23, x12, eq +; GISEL-NEXT: lsr x13, x11, x28 +; GISEL-NEXT: csel x12, x27, x12, eq ; GISEL-NEXT: cmp x9, #11 -; GISEL-NEXT: str x27, [sp, #64] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #12 -; GISEL-NEXT: mov x23, x20 +; GISEL-NEXT: str x13, [sp, #96] ; 8-byte Folded Spill ; GISEL-NEXT: csel x12, xzr, x12, eq ; GISEL-NEXT: cmp x9, #13 ; GISEL-NEXT: csel x12, xzr, x12, eq @@ -1864,71 +1874,77 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x11, x11, x12, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x11, [sp, #104] ; 8-byte Folded Spill -; GISEL-NEXT: csel x11, xzr, x27, eq +; GISEL-NEXT: str x11, [sp, #152] ; 8-byte Folded Spill +; GISEL-NEXT: and x11, x8, #0x3f +; GISEL-NEXT: lsl x27, x24, x11 +; GISEL-NEXT: csel x11, xzr, x13, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: orr x11, x27, x11 +; GISEL-NEXT: str x27, [sp, #56] ; 8-byte Folded Spill ; GISEL-NEXT: csel x11, x11, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x0, eq +; GISEL-NEXT: csel x12, xzr, x22, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: mov x0, x7 -; GISEL-NEXT: orr x12, x20, x12 -; GISEL-NEXT: mov x20, x16 +; GISEL-NEXT: mov x22, x2 +; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: mov x2, x14 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x10, eq +; GISEL-NEXT: csel x12, xzr, x21, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: ldr x10, [sp, #312] ; 8-byte Folded Reload -; GISEL-NEXT: orr x12, x2, x12 -; GISEL-NEXT: ldr x2, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x21, [sp, #288] ; 8-byte Folded Reload +; GISEL-NEXT: orr x12, x1, x12 +; GISEL-NEXT: mov x1, x27 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x19, eq +; GISEL-NEXT: csel x12, xzr, x16, eq ; GISEL-NEXT: cmp x9, #3 ; GISEL-NEXT: orr x12, x21, x12 -; GISEL-NEXT: ldr x21, [sp, #200] ; 8-byte Folded Reload ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x6, eq +; GISEL-NEXT: csel x12, xzr, x30, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: orr x12, x21, x12 +; GISEL-NEXT: orr x12, x7, x12 +; GISEL-NEXT: mov x7, x15 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x15, eq +; GISEL-NEXT: str x7, [sp, #40] ; 8-byte Folded Spill +; GISEL-NEXT: csel x12, xzr, x23, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: orr x12, x28, x12 +; GISEL-NEXT: orr x12, x15, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x7, eq +; GISEL-NEXT: csel x12, xzr, x5, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: mov x7, x17 -; GISEL-NEXT: orr x12, x2, x12 +; GISEL-NEXT: mov x5, x19 +; GISEL-NEXT: orr x12, x19, x12 +; GISEL-NEXT: mov x19, x7 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x16, eq +; GISEL-NEXT: csel x12, xzr, x14, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: orr x12, x30, x12 +; GISEL-NEXT: lsr x14, x24, x28 +; GISEL-NEXT: orr x12, x4, x12 +; GISEL-NEXT: mov x4, x10 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x17, eq +; GISEL-NEXT: csel x12, xzr, x25, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: mov x17, x24 -; GISEL-NEXT: orr x12, x1, x12 +; GISEL-NEXT: orr x12, x10, x12 +; GISEL-NEXT: ldr x10, [sp, #304] ; 8-byte Folded Reload ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x14, eq -; GISEL-NEXT: ldr x14, [sp, #8] ; 8-byte Folded Reload +; GISEL-NEXT: csel x12, xzr, x20, eq ; GISEL-NEXT: cmp x9, #9 -; GISEL-NEXT: orr x12, x14, x12 +; GISEL-NEXT: orr x12, x26, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x12, xzr, x25, eq +; GISEL-NEXT: csel x12, xzr, x6, eq ; GISEL-NEXT: cmp x9, #10 -; GISEL-NEXT: orr x12, x26, x12 +; GISEL-NEXT: orr x12, x0, x12 ; GISEL-NEXT: csel x11, x12, x11, eq ; GISEL-NEXT: cmp x9, #11 -; GISEL-NEXT: csel x11, x13, x11, eq +; GISEL-NEXT: csel x11, x17, x11, eq ; GISEL-NEXT: cmp x9, #12 ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #13 @@ -1937,393 +1953,395 @@ define void @test_shl_i1024(ptr %result, ptr %input, i32 %shift) { ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x12, xzr, x11, eq +; GISEL-NEXT: ldp x11, x6, [x10, #96] ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: ldp x11, x10, [x10, #96] -; GISEL-NEXT: csel x12, x5, x12, eq -; GISEL-NEXT: str x12, [sp, #96] ; 8-byte Folded Spill -; GISEL-NEXT: mov x12, x22 -; GISEL-NEXT: lsr x22, x5, x22 -; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: mov x5, x27 -; GISEL-NEXT: lsl x24, x11, x24 -; GISEL-NEXT: str x10, [sp, #296] ; 8-byte Folded Spill -; GISEL-NEXT: csel x10, xzr, x22, eq +; GISEL-NEXT: and x10, x8, #0x3f +; GISEL-NEXT: csel x12, x24, x12, eq +; GISEL-NEXT: tst x8, #0x3f +; GISEL-NEXT: ldr x24, [sp, #248] ; 8-byte Folded Reload +; GISEL-NEXT: lsl x15, x11, x10 +; GISEL-NEXT: csel x10, xzr, x14, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: str x22, [sp, #16] ; 8-byte Folded Spill -; GISEL-NEXT: orr x10, x24, x10 +; GISEL-NEXT: str x12, [sp, #136] ; 8-byte Folded Spill +; GISEL-NEXT: ldr x12, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: orr x10, x15, x10 +; GISEL-NEXT: str x15, [sp, #296] ; 8-byte Folded Spill +; GISEL-NEXT: mov x15, x13 ; GISEL-NEXT: csel x10, x10, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x27, eq +; GISEL-NEXT: csel x13, xzr, x13, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: ldr x27, [sp, #280] ; 8-byte Folded Reload -; GISEL-NEXT: orr x13, x3, x13 -; GISEL-NEXT: mov x3, x26 +; GISEL-NEXT: orr x13, x27, x13 +; GISEL-NEXT: ldr x27, [sp, #240] ; 8-byte Folded Reload ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x13, xzr, x27, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x13, x23, x13 -; GISEL-NEXT: mov x23, x4 +; GISEL-NEXT: orr x13, x22, x13 +; GISEL-NEXT: ldr x22, [sp, #272] ; 8-byte Folded Reload ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x4, eq -; GISEL-NEXT: ldp x4, x16, [sp, #216] ; 16-byte Folded Reload +; GISEL-NEXT: csel x13, xzr, x3, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: orr x13, x16, x13 +; GISEL-NEXT: orr x13, x12, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x19, eq +; GISEL-NEXT: csel x13, xzr, x16, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: mov x19, x1 -; GISEL-NEXT: orr x13, x4, x13 +; GISEL-NEXT: mov x16, x17 +; GISEL-NEXT: orr x13, x21, x13 +; GISEL-NEXT: ldp x23, x21, [sp, #256] ; 16-byte Folded Reload ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x6, eq +; GISEL-NEXT: csel x13, xzr, x30, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: mov x6, x14 -; GISEL-NEXT: orr x13, x21, x13 +; GISEL-NEXT: mov x30, x0 +; GISEL-NEXT: orr x13, x23, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x15, eq +; GISEL-NEXT: csel x13, xzr, x21, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: orr x13, x28, x13 +; GISEL-NEXT: orr x13, x7, x13 +; GISEL-NEXT: mov x7, x14 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x0, eq +; GISEL-NEXT: csel x13, xzr, x24, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: mov x0, x23 -; GISEL-NEXT: orr x13, x2, x13 +; GISEL-NEXT: orr x13, x5, x13 +; GISEL-NEXT: ldr x5, [sp, #48] ; 8-byte Folded Reload ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x20, eq +; GISEL-NEXT: csel x13, xzr, x2, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: orr x13, x30, x13 -; GISEL-NEXT: ldr x30, [sp, #208] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x2, [sp, #296] ; 8-byte Folded Reload +; GISEL-NEXT: orr x13, x5, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x7, eq +; GISEL-NEXT: csel x13, xzr, x25, eq ; GISEL-NEXT: cmp x9, #9 -; GISEL-NEXT: orr x13, x1, x13 +; GISEL-NEXT: mov x25, x6 +; GISEL-NEXT: orr x13, x4, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x30, eq +; GISEL-NEXT: csel x13, xzr, x20, eq ; GISEL-NEXT: cmp x9, #10 -; GISEL-NEXT: orr x13, x14, x13 -; GISEL-NEXT: ldp x14, x2, [sp, #264] ; 16-byte Folded Reload +; GISEL-NEXT: orr x13, x26, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x13, xzr, x25, eq +; GISEL-NEXT: csel x13, xzr, x22, eq ; GISEL-NEXT: cmp x9, #11 -; GISEL-NEXT: orr x13, x26, x13 -; GISEL-NEXT: ldr x26, [sp, #288] ; 8-byte Folded Reload +; GISEL-NEXT: orr x13, x0, x13 ; GISEL-NEXT: csel x10, x13, x10, eq ; GISEL-NEXT: cmp x9, #12 -; GISEL-NEXT: lsr x13, x11, x12 -; GISEL-NEXT: csel x10, x26, x10, eq +; GISEL-NEXT: lsr x13, x11, x28 +; GISEL-NEXT: csel x10, x17, x10, eq ; GISEL-NEXT: cmp x9, #13 +; GISEL-NEXT: ldr x17, [sp, #80] ; 8-byte Folded Reload ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #14 -; GISEL-NEXT: str x13, [sp, #72] ; 8-byte Folded Spill +; GISEL-NEXT: str x13, [sp, #104] ; 8-byte Folded Spill ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x8, #0 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: str x10, [sp, #88] ; 8-byte Folded Spill -; GISEL-NEXT: ldr x10, [sp, #296] ; 8-byte Folded Reload -; GISEL-NEXT: lsl x11, x10, x17 +; GISEL-NEXT: str x10, [sp, #128] ; 8-byte Folded Spill +; GISEL-NEXT: and x10, x8, #0x3f +; GISEL-NEXT: lsl x11, x6, x10 ; GISEL-NEXT: csel x10, xzr, x13, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: ldr x17, [sp, #232] ; 8-byte Folded Reload -; GISEL-NEXT: ldr x13, [sp, #256] ; 8-byte Folded Reload +; GISEL-NEXT: ldp x0, x13, [sp, #280] ; 16-byte Folded Reload +; GISEL-NEXT: mov x6, x16 ; GISEL-NEXT: orr x10, x11, x10 -; GISEL-NEXT: str x11, [sp, #56] ; 8-byte Folded Spill +; GISEL-NEXT: str x11, [sp, #88] ; 8-byte Folded Spill ; GISEL-NEXT: csel x10, x10, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x22, eq +; GISEL-NEXT: csel x11, xzr, x14, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x11, x24, x11 +; GISEL-NEXT: orr x11, x2, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x5, eq +; GISEL-NEXT: csel x11, xzr, x15, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x11, x2, x11 -; GISEL-NEXT: ldp x12, x5, [sp, #240] ; 16-byte Folded Reload +; GISEL-NEXT: mov x15, x3 +; GISEL-NEXT: orr x11, x1, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x11, xzr, x27, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: mov x27, x30 -; GISEL-NEXT: orr x11, x17, x11 +; GISEL-NEXT: orr x11, x0, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: csel x11, xzr, x3, eq +; GISEL-NEXT: ldp x14, x3, [sp, #320] ; 16-byte Folded Reload ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: mov x23, x20 -; GISEL-NEXT: orr x11, x16, x11 -; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: orr x11, x12, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x14, eq +; GISEL-NEXT: csel x11, xzr, x3, eq ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: orr x11, x4, x11 +; GISEL-NEXT: orr x11, x13, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x13, eq +; GISEL-NEXT: csel x11, xzr, x14, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: orr x11, x21, x11 -; GISEL-NEXT: ldr x21, [sp, #296] ; 8-byte Folded Reload +; GISEL-NEXT: orr x11, x23, x11 +; GISEL-NEXT: mov x23, x5 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x15, eq +; GISEL-NEXT: csel x11, xzr, x21, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: orr x11, x28, x11 +; GISEL-NEXT: mov x21, x4 +; GISEL-NEXT: orr x11, x19, x11 +; GISEL-NEXT: ldp x12, x19, [sp, #64] ; 16-byte Folded Reload ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x12, eq +; GISEL-NEXT: csel x11, xzr, x24, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: orr x11, x16, x11 +; GISEL-NEXT: orr x11, x12, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x20, eq +; GISEL-NEXT: csel x11, xzr, x19, eq ; GISEL-NEXT: cmp x9, #9 ; GISEL-NEXT: orr x11, x5, x11 +; GISEL-NEXT: mov x5, x30 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x7, eq +; GISEL-NEXT: csel x11, xzr, x17, eq ; GISEL-NEXT: cmp x9, #10 -; GISEL-NEXT: orr x11, x1, x11 -; GISEL-NEXT: ldr x1, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: orr x11, x4, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x30, eq +; GISEL-NEXT: csel x11, xzr, x20, eq ; GISEL-NEXT: cmp x9, #11 -; GISEL-NEXT: orr x11, x6, x11 +; GISEL-NEXT: orr x11, x26, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x25, eq +; GISEL-NEXT: csel x11, xzr, x22, eq ; GISEL-NEXT: cmp x9, #12 -; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: orr x11, x30, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: cmp x9, #13 -; GISEL-NEXT: csel x10, x26, x10, eq +; GISEL-NEXT: csel x10, x16, x10, eq ; GISEL-NEXT: cmp x9, #14 +; GISEL-NEXT: ldr x16, [sp, #304] ; 8-byte Folded Reload ; GISEL-NEXT: csel x10, xzr, x10, eq ; GISEL-NEXT: cmp x9, #15 ; GISEL-NEXT: csel x11, xzr, x10, eq ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: csel x11, x21, x11, eq -; GISEL-NEXT: ldp x10, x20, [x1, #112] -; GISEL-NEXT: str x11, [sp, #80] ; 8-byte Folded Spill -; GISEL-NEXT: ldp x11, x4, [sp, #40] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x10, x4, [x16, #112] +; GISEL-NEXT: csel x11, x25, x11, eq +; GISEL-NEXT: str x11, [sp, #120] ; 8-byte Folded Spill +; GISEL-NEXT: lsr x11, x25, x28 +; GISEL-NEXT: and x16, x8, #0x3f ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: lsr x21, x21, x4 -; GISEL-NEXT: lsl x28, x10, x11 -; GISEL-NEXT: csel x1, xzr, x21, eq -; GISEL-NEXT: str x21, [sp, #296] ; 8-byte Folded Spill +; GISEL-NEXT: ldr x25, [sp, #88] ; 8-byte Folded Reload +; GISEL-NEXT: lsl x24, x10, x16 +; GISEL-NEXT: csel x1, xzr, x11, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: orr x1, x28, x1 -; GISEL-NEXT: ldr x21, [sp, #72] ; 8-byte Folded Reload -; GISEL-NEXT: str x28, [sp, #312] ; 8-byte Folded Spill +; GISEL-NEXT: ldp x16, x28, [sp, #96] ; 16-byte Folded Reload +; GISEL-NEXT: orr x1, x24, x1 ; GISEL-NEXT: csel x1, x1, xzr, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: ldr x28, [sp, #56] ; 8-byte Folded Reload -; GISEL-NEXT: csel x30, xzr, x21, eq +; GISEL-NEXT: csel x30, xzr, x28, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x30, x28, x30 +; GISEL-NEXT: orr x30, x25, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x22, eq +; GISEL-NEXT: csel x30, xzr, x7, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: ldr x22, [sp, #64] ; 8-byte Folded Reload -; GISEL-NEXT: orr x30, x24, x30 +; GISEL-NEXT: orr x30, x2, x30 +; GISEL-NEXT: ldr x2, [sp, #56] ; 8-byte Folded Reload ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x22, eq +; GISEL-NEXT: csel x30, xzr, x16, eq ; GISEL-NEXT: cmp x9, #3 ; GISEL-NEXT: orr x30, x2, x30 -; GISEL-NEXT: ldr x2, [sp, #280] ; 8-byte Folded Reload ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x2, eq +; GISEL-NEXT: csel x30, xzr, x27, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: orr x30, x17, x30 -; GISEL-NEXT: ldr x17, [sp, #224] ; 8-byte Folded Reload +; GISEL-NEXT: mov x27, x13 +; GISEL-NEXT: orr x30, x0, x30 +; GISEL-NEXT: ldr x0, [sp, #248] ; 8-byte Folded Reload ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x0, eq +; GISEL-NEXT: csel x30, xzr, x15, eq +; GISEL-NEXT: ldr x15, [sp, #312] ; 8-byte Folded Reload ; GISEL-NEXT: cmp x9, #5 -; GISEL-NEXT: orr x30, x17, x30 +; GISEL-NEXT: orr x30, x15, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x14, eq -; GISEL-NEXT: ldr x14, [sp, #216] ; 8-byte Folded Reload +; GISEL-NEXT: csel x30, xzr, x3, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: orr x30, x14, x30 +; GISEL-NEXT: ldr x3, [sp, #40] ; 8-byte Folded Reload +; GISEL-NEXT: orr x30, x13, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x13, eq -; GISEL-NEXT: ldr x13, [sp, #200] ; 8-byte Folded Reload +; GISEL-NEXT: csel x30, xzr, x14, eq +; GISEL-NEXT: ldp x13, x14, [sp, #256] ; 16-byte Folded Reload ; GISEL-NEXT: cmp x9, #7 ; GISEL-NEXT: orr x30, x13, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x15, eq -; GISEL-NEXT: ldr x15, [sp, #32] ; 8-byte Folded Reload +; GISEL-NEXT: csel x30, xzr, x14, eq ; GISEL-NEXT: cmp x9, #8 -; GISEL-NEXT: orr x30, x15, x30 +; GISEL-NEXT: orr x30, x3, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x12, eq +; GISEL-NEXT: csel x30, xzr, x0, eq ; GISEL-NEXT: cmp x9, #9 -; GISEL-NEXT: orr x30, x16, x30 +; GISEL-NEXT: orr x30, x12, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x23, eq +; GISEL-NEXT: csel x30, xzr, x19, eq ; GISEL-NEXT: cmp x9, #10 -; GISEL-NEXT: orr x30, x5, x30 +; GISEL-NEXT: orr x30, x23, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x7, eq +; GISEL-NEXT: csel x30, xzr, x17, eq ; GISEL-NEXT: cmp x9, #11 -; GISEL-NEXT: orr x30, x19, x30 +; GISEL-NEXT: orr x30, x21, x30 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x27, eq +; GISEL-NEXT: csel x30, xzr, x20, eq ; GISEL-NEXT: cmp x9, #12 -; GISEL-NEXT: orr x30, x6, x30 +; GISEL-NEXT: mov x20, x26 +; GISEL-NEXT: orr x30, x26, x30 +; GISEL-NEXT: mov x26, x5 ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x30, xzr, x25, eq +; GISEL-NEXT: csel x30, xzr, x22, eq ; GISEL-NEXT: cmp x9, #13 -; GISEL-NEXT: orr x30, x3, x30 +; GISEL-NEXT: orr x30, x5, x30 +; GISEL-NEXT: ldr x5, [sp, #16] ; 8-byte Folded Reload ; GISEL-NEXT: csel x1, x30, x1, eq ; GISEL-NEXT: cmp x9, #14 -; GISEL-NEXT: lsr x30, x10, x4 -; GISEL-NEXT: csel x1, x26, x1, eq +; GISEL-NEXT: csel x1, x6, x1, eq ; GISEL-NEXT: cmp x9, #15 +; GISEL-NEXT: lsr x30, x10, x5 ; GISEL-NEXT: csel x1, xzr, x1, eq ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: csel x26, x10, x1, eq -; GISEL-NEXT: lsl x10, x20, x11 +; GISEL-NEXT: csel x5, x10, x1, eq +; GISEL-NEXT: and x10, x8, #0x3f ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x16, xzr, x30, eq +; GISEL-NEXT: lsl x10, x4, x10 +; GISEL-NEXT: csel x1, xzr, x30, eq ; GISEL-NEXT: cmp x9, #0 -; GISEL-NEXT: ldr x11, [sp, #296] ; 8-byte Folded Reload -; GISEL-NEXT: orr x10, x10, x16 -; GISEL-NEXT: ldr x16, [sp, #312] ; 8-byte Folded Reload +; GISEL-NEXT: ldp x29, x30, [sp, #416] ; 16-byte Folded Reload +; GISEL-NEXT: orr x10, x10, x1 +; GISEL-NEXT: ldr x1, [sp, #296] ; 8-byte Folded Reload ; GISEL-NEXT: csel x10, x10, xzr, eq ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #1 -; GISEL-NEXT: orr x11, x16, x11 -; GISEL-NEXT: ldr x16, [sp, #272] ; 8-byte Folded Reload +; GISEL-NEXT: orr x11, x24, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x21, eq +; GISEL-NEXT: csel x11, xzr, x28, eq ; GISEL-NEXT: cmp x9, #2 -; GISEL-NEXT: orr x11, x28, x11 -; GISEL-NEXT: ldp x29, x30, [sp, #400] ; 16-byte Folded Reload +; GISEL-NEXT: orr x11, x25, x11 ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #16] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: csel x11, xzr, x7, eq ; GISEL-NEXT: cmp x9, #3 -; GISEL-NEXT: orr x11, x24, x11 +; GISEL-NEXT: orr x11, x1, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x22, eq +; GISEL-NEXT: csel x11, xzr, x16, eq ; GISEL-NEXT: cmp x9, #4 -; GISEL-NEXT: orr x11, x16, x11 -; GISEL-NEXT: ldr x16, [sp, #232] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x16, [sp, #280] ; 8-byte Folded Reload +; GISEL-NEXT: orr x11, x2, x11 ; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #240] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x2, eq +; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #5 ; GISEL-NEXT: orr x11, x16, x11 -; GISEL-NEXT: ldp x22, x21, [sp, #368] ; 16-byte Folded Reload ; GISEL-NEXT: csel x10, x11, x10, eq +; GISEL-NEXT: ldr x11, [sp, #32] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x0, eq +; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #6 -; GISEL-NEXT: orr x11, x17, x11 +; GISEL-NEXT: orr x11, x15, x11 ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #264] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x11, [sp, #328] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #7 -; GISEL-NEXT: orr x11, x14, x11 +; GISEL-NEXT: orr x11, x27, x11 +; GISEL-NEXT: ldp x28, x27, [sp, #336] ; 16-byte Folded Reload ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #256] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x11, [sp, #320] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #8 ; GISEL-NEXT: orr x11, x13, x11 -; GISEL-NEXT: ldr x13, [sp, #112] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x13, [sp, #144] ; 8-byte Folded Reload ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #24] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x11, eq +; GISEL-NEXT: csel x11, xzr, x14, eq ; GISEL-NEXT: cmp x9, #9 -; GISEL-NEXT: orr x11, x15, x11 +; GISEL-NEXT: orr x11, x3, x11 ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: tst x8, #0x3f -; GISEL-NEXT: csel x11, xzr, x12, eq -; GISEL-NEXT: ldr x12, [sp, #304] ; 8-byte Folded Reload +; GISEL-NEXT: csel x11, xzr, x0, eq ; GISEL-NEXT: cmp x9, #10 ; GISEL-NEXT: orr x11, x12, x11 ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #192] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x11, [sp, #232] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: str x11, [x13] -; GISEL-NEXT: ldp x12, x11, [sp, #176] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x12, x11, [sp, #216] ; 16-byte Folded Reload ; GISEL-NEXT: stp x11, x12, [x13, #8] -; GISEL-NEXT: csel x11, xzr, x23, eq +; GISEL-NEXT: csel x11, xzr, x19, eq ; GISEL-NEXT: cmp x9, #11 -; GISEL-NEXT: orr x11, x5, x11 -; GISEL-NEXT: ldp x24, x23, [sp, #352] ; 16-byte Folded Reload +; GISEL-NEXT: orr x11, x23, x11 +; GISEL-NEXT: ldp x24, x23, [sp, #368] ; 16-byte Folded Reload ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #168] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x11, [sp, #208] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: str x11, [x13, #24] -; GISEL-NEXT: ldp x12, x11, [sp, #152] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x12, x11, [sp, #192] ; 16-byte Folded Reload ; GISEL-NEXT: stp x11, x12, [x13, #32] -; GISEL-NEXT: csel x11, xzr, x7, eq +; GISEL-NEXT: csel x11, xzr, x17, eq ; GISEL-NEXT: cmp x9, #12 -; GISEL-NEXT: orr x11, x19, x11 +; GISEL-NEXT: orr x11, x21, x11 ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #144] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x11, [sp, #184] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: str x11, [x13, #48] -; GISEL-NEXT: ldp x12, x11, [sp, #128] ; 16-byte Folded Reload +; GISEL-NEXT: ldp x12, x11, [sp, #168] ; 16-byte Folded Reload ; GISEL-NEXT: stp x11, x12, [x13, #56] -; GISEL-NEXT: csel x11, xzr, x27, eq +; GISEL-NEXT: ldr x11, [sp, #112] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x12, [sp, #136] ; 8-byte Folded Reload +; GISEL-NEXT: csel x11, xzr, x11, eq ; GISEL-NEXT: cmp x9, #13 -; GISEL-NEXT: orr x11, x6, x11 -; GISEL-NEXT: ldp x28, x27, [sp, #320] ; 16-byte Folded Reload +; GISEL-NEXT: orr x11, x20, x11 +; GISEL-NEXT: ldp x20, x19, [sp, #400] ; 16-byte Folded Reload ; GISEL-NEXT: csel x10, x11, x10, eq -; GISEL-NEXT: ldr x11, [sp, #120] ; 8-byte Folded Reload +; GISEL-NEXT: ldr x11, [sp, #160] ; 8-byte Folded Reload ; GISEL-NEXT: tst x8, #0x3f ; GISEL-NEXT: str x11, [x13, #72] -; GISEL-NEXT: ldp x12, x11, [sp, #96] ; 16-byte Folded Reload -; GISEL-NEXT: stp x11, x12, [x13, #80] -; GISEL-NEXT: csel x11, xzr, x25, eq +; GISEL-NEXT: ldr x11, [sp, #152] ; 8-byte Folded Reload +; GISEL-NEXT: str x11, [x13, #80] +; GISEL-NEXT: csel x11, xzr, x22, eq ; GISEL-NEXT: cmp x9, #14 -; GISEL-NEXT: orr x11, x3, x11 +; GISEL-NEXT: orr x11, x26, x11 +; GISEL-NEXT: ldp x22, x21, [sp, #384] ; 16-byte Folded Reload ; GISEL-NEXT: csel x10, x11, x10, eq ; GISEL-NEXT: cmp x9, #15 -; GISEL-NEXT: ldr x9, [sp, #288] ; 8-byte Folded Reload -; GISEL-NEXT: ldr x11, [sp, #88] ; 8-byte Folded Reload -; GISEL-NEXT: csel x9, x9, x10, eq +; GISEL-NEXT: ldr x9, [sp, #128] ; 8-byte Folded Reload +; GISEL-NEXT: ldp x26, x25, [sp, #352] ; 16-byte Folded Reload +; GISEL-NEXT: stp x12, x9, [x13, #88] +; GISEL-NEXT: csel x9, x6, x10, eq ; GISEL-NEXT: cmp x8, #0 -; GISEL-NEXT: ldr x8, [sp, #80] ; 8-byte Folded Reload -; GISEL-NEXT: stp x11, x8, [x13, #96] -; GISEL-NEXT: csel x8, x20, x9, eq -; GISEL-NEXT: stp x26, x8, [x13, #112] -; GISEL-NEXT: ldp x20, x19, [sp, #384] ; 16-byte Folded Reload -; GISEL-NEXT: ldp x26, x25, [sp, #336] ; 16-byte Folded Reload -; GISEL-NEXT: add sp, sp, #416 +; GISEL-NEXT: ldr x8, [sp, #120] ; 8-byte Folded Reload +; GISEL-NEXT: stp x8, x5, [x13, #104] +; GISEL-NEXT: csel x8, x4, x9, eq +; GISEL-NEXT: str x8, [x13, #120] +; GISEL-NEXT: add sp, sp, #432 ; GISEL-NEXT: ret entry: %input_val = load i1024, ptr %input, align 128 diff --git a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll index 63c08dd..b215c51 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-matrix-umull-smull.ll @@ -267,7 +267,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-SD-NEXT: and x9, x8, #0xfffffff0 ; CHECK-SD-NEXT: add x10, x2, #32 ; CHECK-SD-NEXT: add x11, x0, #16 -; CHECK-SD-NEXT: mov x12, x9 +; CHECK-SD-NEXT: and x12, x8, #0xfffffff0 ; CHECK-SD-NEXT: .LBB3_4: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16] @@ -313,7 +313,7 @@ define void @larger_smull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: and x10, x9, #0xfffffff0 ; CHECK-GI-NEXT: add x11, x2, #32 ; CHECK-GI-NEXT: add x12, x0, #16 -; CHECK-GI-NEXT: mov x13, x10 +; CHECK-GI-NEXT: and x13, x9, #0xfffffff0 ; CHECK-GI-NEXT: xtn v0.4h, v0.4s ; CHECK-GI-NEXT: .LBB3_3: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 @@ -428,7 +428,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-SD-NEXT: and x9, x8, #0xfffffff0 ; CHECK-SD-NEXT: add x10, x2, #32 ; CHECK-SD-NEXT: add x11, x0, #16 -; CHECK-SD-NEXT: mov x12, x9 +; CHECK-SD-NEXT: and x12, x8, #0xfffffff0 ; CHECK-SD-NEXT: .LBB4_4: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldp q1, q2, [x11, #-16] @@ -472,7 +472,7 @@ define void @larger_umull(ptr nocapture noundef readonly %x, i16 noundef %y, ptr ; CHECK-GI-NEXT: and x8, x9, #0xfffffff0 ; CHECK-GI-NEXT: add x10, x2, #32 ; CHECK-GI-NEXT: add x11, x0, #16 -; CHECK-GI-NEXT: mov x12, x8 +; CHECK-GI-NEXT: and x12, x9, #0xfffffff0 ; CHECK-GI-NEXT: .LBB4_3: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: and w13, w1, #0xffff @@ -596,7 +596,7 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-SD-NEXT: and x11, x10, #0xfffffff0 ; CHECK-SD-NEXT: fmov s2, w9 ; CHECK-SD-NEXT: add x8, x0, #8 -; CHECK-SD-NEXT: mov x12, x11 +; CHECK-SD-NEXT: and x12, x10, #0xfffffff0 ; CHECK-SD-NEXT: .LBB5_5: // %vector.body ; CHECK-SD-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-SD-NEXT: ldp d3, d4, [x8, #-8] @@ -646,10 +646,10 @@ define i16 @red_mla_dup_ext_u8_s8_s16(ptr noalias nocapture noundef readonly %A, ; CHECK-GI-NEXT: movi v0.2d, #0000000000000000 ; CHECK-GI-NEXT: movi v1.2d, #0000000000000000 ; CHECK-GI-NEXT: add x10, x0, #8 +; CHECK-GI-NEXT: and x11, x8, #0xfffffff0 ; CHECK-GI-NEXT: sbfx w9, w9, #8, #8 ; CHECK-GI-NEXT: dup v2.8h, w9 ; CHECK-GI-NEXT: and x9, x8, #0xfffffff0 -; CHECK-GI-NEXT: mov x11, x9 ; CHECK-GI-NEXT: .LBB5_5: // %vector.body ; CHECK-GI-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-GI-NEXT: ldp d3, d4, [x10, #-8] diff --git a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll index 4c8e589..c23e4e1 100644 --- a/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll +++ b/llvm/test/CodeGen/AArch64/machine-combiner-copy.ll @@ -17,7 +17,7 @@ define void @fma_dup_f16(ptr noalias nocapture noundef readonly %A, half noundef ; CHECK-NEXT: and x9, x8, #0xfffffff0 ; CHECK-NEXT: add x10, x1, #16 ; CHECK-NEXT: add x11, x0, #16 -; CHECK-NEXT: mov x12, x9 +; CHECK-NEXT: and x12, x8, #0xfffffff0 ; CHECK-NEXT: .LBB0_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp q1, q4, [x10, #-16] diff --git a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll index f6bbdf5..1770bb9 100644 --- a/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll +++ b/llvm/test/CodeGen/AArch64/machine-licm-sub-loop.ll @@ -14,7 +14,6 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) { ; CHECK-NEXT: mov x9, xzr ; CHECK-NEXT: and x12, x10, #0xfffffff0 ; CHECK-NEXT: add x13, x1, #32 -; CHECK-NEXT: add x14, x2, #16 ; CHECK-NEXT: b .LBB0_3 ; CHECK-NEXT: .LBB0_2: // %for.cond1.for.cond.cleanup3_crit_edge.us ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 @@ -27,52 +26,52 @@ define void @foo(i32 noundef %limit, ptr %out, ptr %y) { ; CHECK-NEXT: // =>This Loop Header: Depth=1 ; CHECK-NEXT: // Child Loop BB0_6 Depth 2 ; CHECK-NEXT: // Child Loop BB0_9 Depth 2 -; CHECK-NEXT: ldrsh w15, [x2, x9, lsl #1] +; CHECK-NEXT: ldrsh w14, [x2, x9, lsl #1] ; CHECK-NEXT: cmp w0, #16 ; CHECK-NEXT: b.hs .LBB0_5 ; CHECK-NEXT: // %bb.4: // in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: mov x18, xzr +; CHECK-NEXT: mov x17, xzr ; CHECK-NEXT: b .LBB0_8 ; CHECK-NEXT: .LBB0_5: // %vector.ph ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: dup v0.8h, w15 -; CHECK-NEXT: mov x16, x14 -; CHECK-NEXT: mov x17, x13 -; CHECK-NEXT: mov x18, x12 +; CHECK-NEXT: dup v0.8h, w14 +; CHECK-NEXT: add x15, x2, #16 +; CHECK-NEXT: mov x16, x13 +; CHECK-NEXT: and x17, x10, #0xfffffff0 ; CHECK-NEXT: .LBB0_6: // %vector.body ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldp q1, q4, [x16, #-16] -; CHECK-NEXT: subs x18, x18, #16 -; CHECK-NEXT: ldp q3, q2, [x17, #-32] -; CHECK-NEXT: add x16, x16, #32 -; CHECK-NEXT: ldp q6, q5, [x17] +; CHECK-NEXT: ldp q1, q4, [x15, #-16] +; CHECK-NEXT: subs x17, x17, #16 +; CHECK-NEXT: ldp q3, q2, [x16, #-32] +; CHECK-NEXT: add x15, x15, #32 +; CHECK-NEXT: ldp q6, q5, [x16] ; CHECK-NEXT: smlal2 v2.4s, v0.8h, v1.8h ; CHECK-NEXT: smlal v3.4s, v0.4h, v1.4h ; CHECK-NEXT: smlal2 v5.4s, v0.8h, v4.8h ; CHECK-NEXT: smlal v6.4s, v0.4h, v4.4h -; CHECK-NEXT: stp q3, q2, [x17, #-32] -; CHECK-NEXT: stp q6, q5, [x17], #64 +; CHECK-NEXT: stp q3, q2, [x16, #-32] +; CHECK-NEXT: stp q6, q5, [x16], #64 ; CHECK-NEXT: b.ne .LBB0_6 ; CHECK-NEXT: // %bb.7: // %middle.block ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 ; CHECK-NEXT: cmp x12, x10 -; CHECK-NEXT: mov x18, x12 +; CHECK-NEXT: and x17, x10, #0xfffffff0 ; CHECK-NEXT: b.eq .LBB0_2 ; CHECK-NEXT: .LBB0_8: // %for.body4.us.preheader ; CHECK-NEXT: // in Loop: Header=BB0_3 Depth=1 -; CHECK-NEXT: add x16, x18, x8 -; CHECK-NEXT: add x17, x2, x18, lsl #1 -; CHECK-NEXT: sub x18, x10, x18 -; CHECK-NEXT: add x16, x1, x16, lsl #2 +; CHECK-NEXT: add x15, x17, x8 +; CHECK-NEXT: add x16, x2, x17, lsl #1 +; CHECK-NEXT: sub x17, x10, x17 +; CHECK-NEXT: add x15, x1, x15, lsl #2 ; CHECK-NEXT: .LBB0_9: // %for.body4.us ; CHECK-NEXT: // Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: // => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrsh w3, [x17], #2 -; CHECK-NEXT: ldr w4, [x16] -; CHECK-NEXT: subs x18, x18, #1 -; CHECK-NEXT: madd w3, w3, w15, w4 -; CHECK-NEXT: str w3, [x16], #4 +; CHECK-NEXT: ldrsh w18, [x16], #2 +; CHECK-NEXT: ldr w3, [x15] +; CHECK-NEXT: subs x17, x17, #1 +; CHECK-NEXT: madd w18, w18, w14, w3 +; CHECK-NEXT: str w18, [x15], #4 ; CHECK-NEXT: b.ne .LBB0_9 ; CHECK-NEXT: b .LBB0_2 ; CHECK-NEXT: .LBB0_10: // %for.cond.cleanup diff --git a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll index 3caac1d..74b0e69 100644 --- a/llvm/test/CodeGen/AArch64/peephole-and-tst.ll +++ b/llvm/test/CodeGen/AArch64/peephole-and-tst.ll @@ -278,9 +278,9 @@ define i64 @test_and_4(i64 %x, i64 %y) { ; CHECK-GI-NEXT: .cfi_offset w19, -8 ; CHECK-GI-NEXT: .cfi_offset w20, -16 ; CHECK-GI-NEXT: .cfi_offset w30, -32 -; CHECK-GI-NEXT: and x20, x0, #0x3 ; CHECK-GI-NEXT: mov x19, x0 -; CHECK-GI-NEXT: mov x0, x20 +; CHECK-GI-NEXT: and x20, x0, #0x3 +; CHECK-GI-NEXT: and x0, x0, #0x3 ; CHECK-GI-NEXT: bl callee ; CHECK-GI-NEXT: tst x19, #0x3 ; CHECK-GI-NEXT: csel x0, x20, x0, eq diff --git a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll index e0f2155..58c01db 100644 --- a/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll +++ b/llvm/test/CodeGen/AArch64/reserveXreg-for-regalloc.ll @@ -7,20 +7,16 @@ define void @foo(i64 %v1, i64 %v2, ptr %ptr) { ; CHECK-LABEL: foo: ; CHECK: // %bb.0: -; CHECK-NEXT: sub sp, sp, #16 -; CHECK-NEXT: .cfi_def_cfa_offset 16 ; CHECK-NEXT: add x3, x0, x1 -; CHECK-NEXT: str x3, [sp, #8] // 8-byte Folded Spill ; CHECK-NEXT: str x3, [x2, #8] ; CHECK-NEXT: ldr x3, [x2, #16] ; CHECK-NEXT: add x3, x0, x3 ; CHECK-NEXT: sub x3, x3, x1 ; CHECK-NEXT: str x3, [x2, #16] -; CHECK-NEXT: ldr x3, [sp, #8] // 8-byte Folded Reload +; CHECK-NEXT: add x3, x0, x1 ; CHECK-NEXT: str x3, [x2, #24] ; CHECK-NEXT: str x0, [x2, #32] ; CHECK-NEXT: str x1, [x2, #40] -; CHECK-NEXT: add sp, sp, #16 ; CHECK-NEXT: ret %v3 = add i64 %v1, %v2 %p1 = getelementptr i64, ptr %ptr, i64 1 diff --git a/llvm/test/CodeGen/AArch64/tbl-loops.ll b/llvm/test/CodeGen/AArch64/tbl-loops.ll index 5fc996a..0f62997 100644 --- a/llvm/test/CodeGen/AArch64/tbl-loops.ll +++ b/llvm/test/CodeGen/AArch64/tbl-loops.ll @@ -23,7 +23,7 @@ define void @loop1(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: add x13, x1, #16 ; CHECK-NEXT: add x8, x1, x10, lsl #2 ; CHECK-NEXT: add x9, x0, x10 -; CHECK-NEXT: mov x14, x10 +; CHECK-NEXT: and x14, x11, #0x1fffffff8 ; CHECK-NEXT: .LBB0_4: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ldp q1, q2, [x13, #-16] @@ -194,9 +194,9 @@ define void @loop2(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: mov w8, #1132396544 // =0x437f0000 ; CHECK-NEXT: and x10, x11, #0x1fffffffc ; CHECK-NEXT: dup v0.4s, w8 +; CHECK-NEXT: and x12, x11, #0x1fffffffc ; CHECK-NEXT: add x8, x1, x10, lsl #3 ; CHECK-NEXT: add x9, x0, x10, lsl #1 -; CHECK-NEXT: mov x12, x10 ; CHECK-NEXT: .LBB1_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld2 { v1.4s, v2.4s }, [x1], #32 @@ -341,7 +341,7 @@ define void @loop3(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: dup v0.4s, w8 ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI2_0] ; CHECK-NEXT: add x9, x10, x10, lsl #1 -; CHECK-NEXT: mov x12, x10 +; CHECK-NEXT: and x12, x11, #0x1fffffffc ; CHECK-NEXT: add x8, x1, x9, lsl #2 ; CHECK-NEXT: add x9, x0, x9 ; CHECK-NEXT: .LBB2_4: // %vector.body @@ -597,7 +597,7 @@ define void @loop4(ptr noalias nocapture noundef writeonly %dst, ptr nocapture n ; CHECK-NEXT: ldr q1, [x12, :lo12:.LCPI3_0] ; CHECK-NEXT: add x8, x1, x10, lsl #4 ; CHECK-NEXT: add x9, x0, x10, lsl #2 -; CHECK-NEXT: mov x12, x10 +; CHECK-NEXT: and x12, x11, #0x1fffffffc ; CHECK-NEXT: .LBB3_9: // %vector.body ; CHECK-NEXT: // =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: ld4 { v2.4s, v3.4s, v4.4s, v5.4s }, [x1], #64 diff --git a/llvm/test/CodeGen/AArch64/trampoline.ll b/llvm/test/CodeGen/AArch64/trampoline.ll index 0e68270..3e933fa 100644 --- a/llvm/test/CodeGen/AArch64/trampoline.ll +++ b/llvm/test/CodeGen/AArch64/trampoline.ll @@ -263,3 +263,9 @@ define i64 @func2() { %fp = call ptr @llvm.adjust.trampoline(ptr @trampg) ret i64 0 } + +; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the +; presence of trampolines. +; UTC_ARGS: --disable +; CHECK-LINUX: .section ".note.GNU-stack","x",@progbits +; UTC_ARGS: --enable diff --git a/llvm/test/CodeGen/ARM/combine-movc-sub.ll b/llvm/test/CodeGen/ARM/combine-movc-sub.ll index ca5d089..8ca4c43 100644 --- a/llvm/test/CodeGen/ARM/combine-movc-sub.ll +++ b/llvm/test/CodeGen/ARM/combine-movc-sub.ll @@ -27,11 +27,11 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, lr} ; CHECK-NEXT: sub sp, #4 -; CHECK-NEXT: sub.w r7, r2, #32 -; CHECK-NEXT: mov r8, r0 +; CHECK-NEXT: sub.w r8, r2, #32 +; CHECK-NEXT: mov r6, r0 ; CHECK-NEXT: movs r0, #1 ; CHECK-NEXT: mov r4, r2 -; CHECK-NEXT: add.w r6, r0, r7, lsr #5 +; CHECK-NEXT: add.w r7, r0, r8, lsr #5 ; CHECK-NEXT: mov r5, r1 ; CHECK-NEXT: mov.w r9, #0 ; CHECK-NEXT: b .LBB0_2 @@ -44,16 +44,16 @@ define hidden fastcc ptr @test(ptr %Search, ptr %ClauseList, i32 %Level, ptr noc ; CHECK-NEXT: mov r2, r4 ; CHECK-NEXT: cmp r4, #31 ; CHECK-NEXT: ldr r0, [r1, #16] -; CHECK-NEXT: add.w r0, r0, r6, lsl #2 +; CHECK-NEXT: add.w r0, r0, r7, lsl #2 ; CHECK-NEXT: ldr r0, [r0, #40] ; CHECK-NEXT: it hi -; CHECK-NEXT: andhi r2, r7, #31 +; CHECK-NEXT: andhi r2, r8, #31 ; CHECK-NEXT: lsrs r0, r2 ; CHECK-NEXT: lsls r0, r0, #31 ; CHECK-NEXT: beq .LBB0_1 ; CHECK-NEXT: @ %bb.3: @ %if.then ; CHECK-NEXT: @ in Loop: Header=BB0_2 Depth=1 -; CHECK-NEXT: mov r0, r8 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: bl foo ; CHECK-NEXT: str.w r9, [r5, #4] ; CHECK-NEXT: b .LBB0_1 diff --git a/llvm/test/CodeGen/ARM/extract-bits.ll b/llvm/test/CodeGen/ARM/extract-bits.ll index 77deaa5..d717806 100644 --- a/llvm/test/CodeGen/ARM/extract-bits.ll +++ b/llvm/test/CodeGen/ARM/extract-bits.ll @@ -316,28 +316,28 @@ define i64 @bextr64_a0(i64 %val, i64 %numskipbits, i64 %numlowbits) nounwind { ; ; V7A-LABEL: bextr64_a0: ; V7A: @ %bb.0: -; V7A-NEXT: .save {r4, lr} -; V7A-NEXT: push {r4, lr} -; V7A-NEXT: ldr r12, [sp, #8] -; V7A-NEXT: mov lr, #1 +; V7A-NEXT: .save {r4, r5, r11, lr} +; V7A-NEXT: push {r4, r5, r11, lr} +; V7A-NEXT: ldr lr, [sp, #16] +; V7A-NEXT: mov r5, #1 ; V7A-NEXT: lsr r0, r0, r2 -; V7A-NEXT: rsb r3, r12, #32 -; V7A-NEXT: subs r4, r12, #32 -; V7A-NEXT: lsr r3, lr, r3 -; V7A-NEXT: lslpl r3, lr, r4 -; V7A-NEXT: lsl r4, lr, r12 -; V7A-NEXT: movwpl r4, #0 -; V7A-NEXT: subs r4, r4, #1 -; V7A-NEXT: sbc r12, r3, #0 -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: orr r0, r0, r1, lsl r3 -; V7A-NEXT: subs r3, r2, #32 -; V7A-NEXT: lsrpl r0, r1, r3 +; V7A-NEXT: rsb r12, lr, #32 +; V7A-NEXT: subs r4, lr, #32 +; V7A-NEXT: lsr r3, r5, r12 +; V7A-NEXT: lslpl r3, r5, r4 +; V7A-NEXT: lsl r5, r5, lr +; V7A-NEXT: movwpl r5, #0 +; V7A-NEXT: rsb r4, r2, #32 +; V7A-NEXT: subs r5, r5, #1 +; V7A-NEXT: sbc r3, r3, #0 +; V7A-NEXT: orr r0, r0, r1, lsl r4 +; V7A-NEXT: subs r4, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r4 ; V7A-NEXT: lsr r1, r1, r2 ; V7A-NEXT: movwpl r1, #0 -; V7A-NEXT: and r0, r4, r0 -; V7A-NEXT: and r1, r12, r1 -; V7A-NEXT: pop {r4, pc} +; V7A-NEXT: and r0, r5, r0 +; V7A-NEXT: and r1, r3, r1 +; V7A-NEXT: pop {r4, r5, r11, pc} ; ; V7A-T-LABEL: bextr64_a0: ; V7A-T: @ %bb.0: @@ -434,28 +434,28 @@ define i64 @bextr64_a0_arithmetic(i64 %val, i64 %numskipbits, i64 %numlowbits) n ; ; V7A-LABEL: bextr64_a0_arithmetic: ; V7A: @ %bb.0: -; V7A-NEXT: .save {r4, lr} -; V7A-NEXT: push {r4, lr} -; V7A-NEXT: ldr r12, [sp, #8] -; V7A-NEXT: mov lr, #1 +; V7A-NEXT: .save {r4, r5, r11, lr} +; V7A-NEXT: push {r4, r5, r11, lr} +; V7A-NEXT: ldr lr, [sp, #16] +; V7A-NEXT: mov r5, #1 ; V7A-NEXT: lsr r0, r0, r2 -; V7A-NEXT: rsb r3, r12, #32 -; V7A-NEXT: subs r4, r12, #32 -; V7A-NEXT: lsr r3, lr, r3 -; V7A-NEXT: lslpl r3, lr, r4 -; V7A-NEXT: lsl r4, lr, r12 -; V7A-NEXT: movwpl r4, #0 -; V7A-NEXT: subs r4, r4, #1 -; V7A-NEXT: sbc r12, r3, #0 -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: orr r0, r0, r1, lsl r3 -; V7A-NEXT: subs r3, r2, #32 +; V7A-NEXT: rsb r12, lr, #32 +; V7A-NEXT: subs r4, lr, #32 +; V7A-NEXT: lsr r3, r5, r12 +; V7A-NEXT: lslpl r3, r5, r4 +; V7A-NEXT: lsl r5, r5, lr +; V7A-NEXT: movwpl r5, #0 +; V7A-NEXT: rsb r4, r2, #32 +; V7A-NEXT: subs r5, r5, #1 +; V7A-NEXT: sbc r3, r3, #0 +; V7A-NEXT: orr r0, r0, r1, lsl r4 +; V7A-NEXT: subs r4, r2, #32 ; V7A-NEXT: asr r2, r1, r2 -; V7A-NEXT: asrpl r0, r1, r3 ; V7A-NEXT: asrpl r2, r1, #31 -; V7A-NEXT: and r0, r4, r0 -; V7A-NEXT: and r1, r12, r2 -; V7A-NEXT: pop {r4, pc} +; V7A-NEXT: asrpl r0, r1, r4 +; V7A-NEXT: and r1, r3, r2 +; V7A-NEXT: and r0, r5, r0 +; V7A-NEXT: pop {r4, r5, r11, pc} ; ; V7A-T-LABEL: bextr64_a0_arithmetic: ; V7A-T: @ %bb.0: @@ -911,28 +911,28 @@ define i64 @bextr64_a4_commutative(i64 %val, i64 %numskipbits, i64 %numlowbits) ; ; V7A-LABEL: bextr64_a4_commutative: ; V7A: @ %bb.0: -; V7A-NEXT: .save {r4, lr} -; V7A-NEXT: push {r4, lr} -; V7A-NEXT: ldr r12, [sp, #8] -; V7A-NEXT: mov lr, #1 +; V7A-NEXT: .save {r4, r5, r11, lr} +; V7A-NEXT: push {r4, r5, r11, lr} +; V7A-NEXT: ldr lr, [sp, #16] +; V7A-NEXT: mov r5, #1 ; V7A-NEXT: lsr r0, r0, r2 -; V7A-NEXT: rsb r3, r12, #32 -; V7A-NEXT: subs r4, r12, #32 -; V7A-NEXT: lsr r3, lr, r3 -; V7A-NEXT: lslpl r3, lr, r4 -; V7A-NEXT: lsl r4, lr, r12 -; V7A-NEXT: movwpl r4, #0 -; V7A-NEXT: subs r4, r4, #1 -; V7A-NEXT: sbc r12, r3, #0 -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: orr r0, r0, r1, lsl r3 -; V7A-NEXT: subs r3, r2, #32 -; V7A-NEXT: lsrpl r0, r1, r3 +; V7A-NEXT: rsb r12, lr, #32 +; V7A-NEXT: subs r4, lr, #32 +; V7A-NEXT: lsr r3, r5, r12 +; V7A-NEXT: lslpl r3, r5, r4 +; V7A-NEXT: lsl r5, r5, lr +; V7A-NEXT: movwpl r5, #0 +; V7A-NEXT: rsb r4, r2, #32 +; V7A-NEXT: subs r5, r5, #1 +; V7A-NEXT: sbc r3, r3, #0 +; V7A-NEXT: orr r0, r0, r1, lsl r4 +; V7A-NEXT: subs r4, r2, #32 +; V7A-NEXT: lsrpl r0, r1, r4 ; V7A-NEXT: lsr r1, r1, r2 ; V7A-NEXT: movwpl r1, #0 -; V7A-NEXT: and r0, r0, r4 -; V7A-NEXT: and r1, r1, r12 -; V7A-NEXT: pop {r4, pc} +; V7A-NEXT: and r0, r0, r5 +; V7A-NEXT: and r1, r1, r3 +; V7A-NEXT: pop {r4, r5, r11, pc} ; ; V7A-T-LABEL: bextr64_a4_commutative: ; V7A-T: @ %bb.0: @@ -3456,22 +3456,22 @@ define i64 @bextr64_d1_indexzext(i64 %val, i8 %numskipbits, i8 %numlowbits) noun ; V7M-NEXT: uxtb r2, r2 ; V7M-NEXT: it pl ; V7M-NEXT: movpl r1, #0 -; V7M-NEXT: rsb.w r12, r2, #32 +; V7M-NEXT: rsb.w r3, r2, #32 ; V7M-NEXT: lsls r1, r2 -; V7M-NEXT: sub.w r3, r2, #32 -; V7M-NEXT: lsr.w r4, r0, r12 +; V7M-NEXT: sub.w r12, r2, #32 +; V7M-NEXT: lsr.w r4, r0, r3 ; V7M-NEXT: orrs r1, r4 -; V7M-NEXT: cmp r3, #0 +; V7M-NEXT: cmp.w r12, #0 ; V7M-NEXT: it pl -; V7M-NEXT: lslpl.w r1, r0, r3 +; V7M-NEXT: lslpl.w r1, r0, r12 ; V7M-NEXT: lsl.w r0, r0, r2 -; V7M-NEXT: lsl.w r4, r1, r12 +; V7M-NEXT: lsl.w r3, r1, r3 ; V7M-NEXT: it pl ; V7M-NEXT: movpl r0, #0 ; V7M-NEXT: lsr.w r0, r0, r2 -; V7M-NEXT: orr.w r0, r0, r4 +; V7M-NEXT: orr.w r0, r0, r3 ; V7M-NEXT: it pl -; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsrpl.w r0, r1, r12 ; V7M-NEXT: lsr.w r1, r1, r2 ; V7M-NEXT: it pl ; V7M-NEXT: movpl r1, #0 @@ -3715,26 +3715,26 @@ define i64 @bextr64_d3_load_indexzext(ptr %w, i8 %numskipbits, i8 %numlowbits) n ; V7M-NEXT: uxtb r2, r2 ; V7M-NEXT: lsl.w r0, lr, r0 ; V7M-NEXT: orr.w r0, r0, r12 -; V7M-NEXT: rsb.w r12, r2, #32 +; V7M-NEXT: sub.w r12, r2, #32 ; V7M-NEXT: it pl ; V7M-NEXT: lsrpl.w r0, lr, r3 ; V7M-NEXT: it pl ; V7M-NEXT: movpl r1, #0 +; V7M-NEXT: rsb.w r3, r2, #32 ; V7M-NEXT: lsls r1, r2 -; V7M-NEXT: sub.w r3, r2, #32 -; V7M-NEXT: lsr.w r4, r0, r12 -; V7M-NEXT: orrs r1, r4 -; V7M-NEXT: cmp r3, #0 +; V7M-NEXT: cmp.w r12, #0 +; V7M-NEXT: lsr.w r4, r0, r3 +; V7M-NEXT: orr.w r1, r1, r4 ; V7M-NEXT: it pl -; V7M-NEXT: lslpl.w r1, r0, r3 +; V7M-NEXT: lslpl.w r1, r0, r12 ; V7M-NEXT: lsl.w r0, r0, r2 -; V7M-NEXT: lsl.w r4, r1, r12 ; V7M-NEXT: it pl ; V7M-NEXT: movpl r0, #0 +; V7M-NEXT: lsl.w r3, r1, r3 ; V7M-NEXT: lsr.w r0, r0, r2 -; V7M-NEXT: orr.w r0, r0, r4 +; V7M-NEXT: orr.w r0, r0, r3 ; V7M-NEXT: it pl -; V7M-NEXT: lsrpl.w r0, r1, r3 +; V7M-NEXT: lsrpl.w r0, r1, r12 ; V7M-NEXT: lsr.w r1, r1, r2 ; V7M-NEXT: it pl ; V7M-NEXT: movpl r1, #0 diff --git a/llvm/test/CodeGen/ARM/extract-lowbits.ll b/llvm/test/CodeGen/ARM/extract-lowbits.ll index b483793..373d998 100644 --- a/llvm/test/CodeGen/ARM/extract-lowbits.ll +++ b/llvm/test/CodeGen/ARM/extract-lowbits.ll @@ -243,15 +243,15 @@ define i64 @bzhi64_a0(i64 %val, i64 %numlowbits) nounwind { ; V7A: @ %bb.0: ; V7A-NEXT: .save {r11, lr} ; V7A-NEXT: push {r11, lr} -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: mov r12, #1 -; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: rsb r12, r2, #32 +; V7A-NEXT: mov lr, #1 ; V7A-NEXT: subs r3, r2, #32 -; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: lsl r2, lr, r2 +; V7A-NEXT: lsr r12, lr, r12 ; V7A-NEXT: movwpl r2, #0 -; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: lslpl r12, lr, r3 ; V7A-NEXT: subs r2, r2, #1 -; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: sbc r3, r12, #0 ; V7A-NEXT: and r0, r2, r0 ; V7A-NEXT: and r1, r3, r1 ; V7A-NEXT: pop {r11, pc} @@ -323,15 +323,15 @@ define i64 @bzhi64_a0_masked(i64 %val, i64 %numlowbits) nounwind { ; V7A-NEXT: .save {r11, lr} ; V7A-NEXT: push {r11, lr} ; V7A-NEXT: and r2, r2, #63 -; V7A-NEXT: mov r12, #1 -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: mov lr, #1 +; V7A-NEXT: rsb r12, r2, #32 ; V7A-NEXT: subs r3, r2, #32 -; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: lsl r2, lr, r2 +; V7A-NEXT: lsr r12, lr, r12 ; V7A-NEXT: movwpl r2, #0 -; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: lslpl r12, lr, r3 ; V7A-NEXT: subs r2, r2, #1 -; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: sbc r3, r12, #0 ; V7A-NEXT: and r0, r2, r0 ; V7A-NEXT: and r1, r3, r1 ; V7A-NEXT: pop {r11, pc} @@ -404,15 +404,15 @@ define i64 @bzhi64_a1_indexzext(i64 %val, i8 zeroext %numlowbits) nounwind { ; V7A: @ %bb.0: ; V7A-NEXT: .save {r11, lr} ; V7A-NEXT: push {r11, lr} -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: mov r12, #1 -; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: rsb r12, r2, #32 +; V7A-NEXT: mov lr, #1 ; V7A-NEXT: subs r3, r2, #32 -; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: lsl r2, lr, r2 +; V7A-NEXT: lsr r12, lr, r12 ; V7A-NEXT: movwpl r2, #0 -; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: lslpl r12, lr, r3 ; V7A-NEXT: subs r2, r2, #1 -; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: sbc r3, r12, #0 ; V7A-NEXT: and r0, r2, r0 ; V7A-NEXT: and r1, r3, r1 ; V7A-NEXT: pop {r11, pc} @@ -644,15 +644,15 @@ define i64 @bzhi64_a4_commutative(i64 %val, i64 %numlowbits) nounwind { ; V7A: @ %bb.0: ; V7A-NEXT: .save {r11, lr} ; V7A-NEXT: push {r11, lr} -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: mov r12, #1 -; V7A-NEXT: lsr lr, r12, r3 +; V7A-NEXT: rsb r12, r2, #32 +; V7A-NEXT: mov lr, #1 ; V7A-NEXT: subs r3, r2, #32 -; V7A-NEXT: lsl r2, r12, r2 +; V7A-NEXT: lsl r2, lr, r2 +; V7A-NEXT: lsr r12, lr, r12 ; V7A-NEXT: movwpl r2, #0 -; V7A-NEXT: lslpl lr, r12, r3 +; V7A-NEXT: lslpl r12, lr, r3 ; V7A-NEXT: subs r2, r2, #1 -; V7A-NEXT: sbc r3, lr, #0 +; V7A-NEXT: sbc r3, r12, #0 ; V7A-NEXT: and r0, r0, r2 ; V7A-NEXT: and r1, r1, r3 ; V7A-NEXT: pop {r11, pc} @@ -2144,23 +2144,23 @@ define i64 @bzhi64_d2_load(ptr %w, i64 %numlowbits) nounwind { ; ; V7A-LABEL: bzhi64_d2_load: ; V7A: @ %bb.0: -; V7A-NEXT: .save {r5, r7, r11, lr} -; V7A-NEXT: push {r5, r7, r11, lr} +; V7A-NEXT: .save {r5, lr} +; V7A-NEXT: push {r5, lr} ; V7A-NEXT: rsb r3, r2, #64 -; V7A-NEXT: ldm r0, {r0, r7} -; V7A-NEXT: rsb r1, r3, #32 +; V7A-NEXT: ldm r0, {r0, r5} +; V7A-NEXT: rsb r12, r3, #32 ; V7A-NEXT: rsbs r2, r2, #32 -; V7A-NEXT: lsr r5, r0, r1 -; V7A-NEXT: orr r7, r5, r7, lsl r3 -; V7A-NEXT: lslpl r7, r0, r2 +; V7A-NEXT: lsr r1, r0, r12 +; V7A-NEXT: orr r1, r1, r5, lsl r3 +; V7A-NEXT: lslpl r1, r0, r2 ; V7A-NEXT: lsl r0, r0, r3 ; V7A-NEXT: movwpl r0, #0 ; V7A-NEXT: lsr r0, r0, r3 -; V7A-NEXT: orr r0, r0, r7, lsl r1 -; V7A-NEXT: lsr r1, r7, r3 -; V7A-NEXT: lsrpl r0, r7, r2 +; V7A-NEXT: orr r0, r0, r1, lsl r12 +; V7A-NEXT: lsrpl r0, r1, r2 +; V7A-NEXT: lsr r1, r1, r3 ; V7A-NEXT: movwpl r1, #0 -; V7A-NEXT: pop {r5, r7, r11, pc} +; V7A-NEXT: pop {r5, pc} ; ; V7A-T-LABEL: bzhi64_d2_load: ; V7A-T: @ %bb.0: @@ -2237,26 +2237,26 @@ define i64 @bzhi64_d3_load_indexzext(ptr %w, i8 %numlowbits) nounwind { ; ; V7A-LABEL: bzhi64_d3_load_indexzext: ; V7A: @ %bb.0: -; V7A-NEXT: .save {r5, r7, r11, lr} -; V7A-NEXT: push {r5, r7, r11, lr} +; V7A-NEXT: .save {r5, lr} +; V7A-NEXT: push {r5, lr} ; V7A-NEXT: rsb r1, r1, #64 -; V7A-NEXT: ldm r0, {r0, r7} +; V7A-NEXT: ldm r0, {r0, r5} ; V7A-NEXT: uxtb r2, r1 -; V7A-NEXT: rsb r3, r2, #32 -; V7A-NEXT: lsr r5, r0, r3 -; V7A-NEXT: orr r7, r5, r7, lsl r2 +; V7A-NEXT: rsb r12, r2, #32 +; V7A-NEXT: lsr r3, r0, r12 +; V7A-NEXT: orr r3, r3, r5, lsl r2 ; V7A-NEXT: mvn r5, #31 ; V7A-NEXT: uxtab r1, r5, r1 ; V7A-NEXT: cmp r1, #0 -; V7A-NEXT: lslpl r7, r0, r1 +; V7A-NEXT: lslpl r3, r0, r1 ; V7A-NEXT: lsl r0, r0, r2 ; V7A-NEXT: movwpl r0, #0 ; V7A-NEXT: lsr r0, r0, r2 -; V7A-NEXT: orr r0, r0, r7, lsl r3 -; V7A-NEXT: lsrpl r0, r7, r1 -; V7A-NEXT: lsr r1, r7, r2 +; V7A-NEXT: orr r0, r0, r3, lsl r12 +; V7A-NEXT: lsrpl r0, r3, r1 +; V7A-NEXT: lsr r1, r3, r2 ; V7A-NEXT: movwpl r1, #0 -; V7A-NEXT: pop {r5, r7, r11, pc} +; V7A-NEXT: pop {r5, pc} ; ; V7A-T-LABEL: bzhi64_d3_load_indexzext: ; V7A-T: @ %bb.0: diff --git a/llvm/test/CodeGen/ARM/llround-conv.ll b/llvm/test/CodeGen/ARM/llround-conv.ll index 0f57e4a..f734db8 100644 --- a/llvm/test/CodeGen/ARM/llround-conv.ll +++ b/llvm/test/CodeGen/ARM/llround-conv.ll @@ -1,25 +1,71 @@ -; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP -; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT +; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16 + +define i64 @testmsxh_builtin(half %x) { +; CHECK-SOFT-LABEL: testmsxh_builtin: +; CHECK-SOFT: @ %bb.0: @ %entry +; CHECK-SOFT-NEXT: .save {r11, lr} +; CHECK-SOFT-NEXT: push {r11, lr} +; CHECK-SOFT-NEXT: bl __aeabi_h2f +; CHECK-SOFT-NEXT: bl llroundf +; CHECK-SOFT-NEXT: pop {r11, pc} +; +; CHECK-NOFP16-LABEL: testmsxh_builtin: +; CHECK-NOFP16: @ %bb.0: @ %entry +; CHECK-NOFP16-NEXT: .save {r11, lr} +; CHECK-NOFP16-NEXT: push {r11, lr} +; CHECK-NOFP16-NEXT: vmov r0, s0 +; CHECK-NOFP16-NEXT: bl __aeabi_h2f +; CHECK-NOFP16-NEXT: vmov s0, r0 +; CHECK-NOFP16-NEXT: bl llroundf +; CHECK-NOFP16-NEXT: pop {r11, pc} +; +; CHECK-FP16-LABEL: testmsxh_builtin: +; CHECK-FP16: @ %bb.0: @ %entry +; CHECK-FP16-NEXT: .save {r11, lr} +; CHECK-FP16-NEXT: push {r11, lr} +; CHECK-FP16-NEXT: vcvtb.f32.f16 s0, s0 +; CHECK-FP16-NEXT: bl llroundf +; CHECK-FP16-NEXT: pop {r11, pc} +entry: + %0 = tail call i64 @llvm.llround.i64.f16(half %x) + ret i64 %0 +} -; SOFTFP-LABEL: testmsxs_builtin: -; SOFTFP: bl llroundf -; HARDFP-LABEL: testmsxs_builtin: -; HARDFP: bl llroundf define i64 @testmsxs_builtin(float %x) { +; CHECK-LABEL: testmsxs_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llroundf +; CHECK-NEXT: pop {r11, pc} entry: - %0 = tail call i64 @llvm.llround.f32(float %x) + %0 = tail call i64 @llvm.llround.i64.f32(float %x) ret i64 %0 } -; SOFTFP-LABEL: testmsxd_builtin: -; SOFTFP: bl llround -; HARDFP-LABEL: testmsxd_builtin: -; HARDFP: bl llround define i64 @testmsxd_builtin(double %x) { +; CHECK-LABEL: testmsxd_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llround +; CHECK-NEXT: pop {r11, pc} entry: - %0 = tail call i64 @llvm.llround.f64(double %x) + %0 = tail call i64 @llvm.llround.i64.f64(double %x) ret i64 %0 } -declare i64 @llvm.llround.f32(float) nounwind readnone -declare i64 @llvm.llround.f64(double) nounwind readnone +define i64 @testmsxq_builtin(fp128 %x) { +; CHECK-LABEL: testmsxq_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl llroundl +; CHECK-NEXT: pop {r11, pc} +entry: + %0 = tail call i64 @llvm.llround.i64.f128(fp128 %x) + ret i64 %0 +} diff --git a/llvm/test/CodeGen/ARM/lround-conv.ll b/llvm/test/CodeGen/ARM/lround-conv.ll index 3aaed74..03f7a0d 100644 --- a/llvm/test/CodeGen/ARM/lround-conv.ll +++ b/llvm/test/CodeGen/ARM/lround-conv.ll @@ -1,25 +1,47 @@ -; RUN: llc < %s -mtriple=arm-eabi -float-abi=soft | FileCheck %s --check-prefix=SOFTFP -; RUN: llc < %s -mtriple=arm-eabi -float-abi=hard | FileCheck %s --check-prefix=HARDFP +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=armv7-none-eabi -float-abi=soft | FileCheck %s --check-prefixes=CHECK,CHECK-SOFT +; RUN: llc < %s -mtriple=armv7-none-eabihf -mattr=+vfp2 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-NOFP16 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FPv8 +; RUN: llc < %s -mtriple=armv8-none-eabihf -mattr=+fp-armv8,+fullfp16 -float-abi=hard | FileCheck %s --check-prefixes=CHECK,CHECK-FP16 + +;define i32 @testmswh_builtin(half %x) { +;entry: +; %0 = tail call i32 @llvm.lround.i32.f16(half %x) +; ret i32 %0 +;} -; SOFTFP-LABEL: testmsws_builtin: -; SOFTFP: bl lroundf -; HARDFP-LABEL: testmsws_builtin: -; HARDFP: bl lroundf define i32 @testmsws_builtin(float %x) { +; CHECK-LABEL: testmsws_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: b lroundf entry: %0 = tail call i32 @llvm.lround.i32.f32(float %x) ret i32 %0 } -; SOFTFP-LABEL: testmswd_builtin: -; SOFTFP: bl lround -; HARDFP-LABEL: testmswd_builtin: -; HARDFP: bl lround define i32 @testmswd_builtin(double %x) { +; CHECK-LABEL: testmswd_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: b lround entry: %0 = tail call i32 @llvm.lround.i32.f64(double %x) ret i32 %0 } -declare i32 @llvm.lround.i32.f32(float) nounwind readnone -declare i32 @llvm.lround.i32.f64(double) nounwind readnone +define i32 @testmswq_builtin(fp128 %x) { +; CHECK-LABEL: testmswq_builtin: +; CHECK: @ %bb.0: @ %entry +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push {r11, lr} +; CHECK-NEXT: bl lroundl +; CHECK-NEXT: pop {r11, pc} +entry: + %0 = tail call i32 @llvm.lround.i32.f128(fp128 %x) + ret i32 %0 +} + +;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: +; CHECK-FP16: {{.*}} +; CHECK-FPv8: {{.*}} +; CHECK-NOFP16: {{.*}} +; CHECK-SOFT: {{.*}} diff --git a/llvm/test/CodeGen/RISCV/pr69586.ll b/llvm/test/CodeGen/RISCV/pr69586.ll index e761d3a..33b89a4 100644 --- a/llvm/test/CodeGen/RISCV/pr69586.ll +++ b/llvm/test/CodeGen/RISCV/pr69586.ll @@ -39,119 +39,118 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a2, a2, 1 ; NOREMAT-NEXT: sub sp, sp, a2 ; NOREMAT-NEXT: .cfi_escape 0x0f, 0x0e, 0x72, 0x00, 0x11, 0xf0, 0x05, 0x22, 0x11, 0x02, 0x92, 0xa2, 0x38, 0x00, 0x1e, 0x22 # sp + 752 + 2 * vlenb -; NOREMAT-NEXT: mv a7, a0 -; NOREMAT-NEXT: li a0, 32 -; NOREMAT-NEXT: addi a5, a7, 512 -; NOREMAT-NEXT: addi a4, a7, 1024 -; NOREMAT-NEXT: addi a6, a7, 1536 -; NOREMAT-NEXT: li t4, 1 -; NOREMAT-NEXT: li a2, 5 +; NOREMAT-NEXT: li a7, 32 +; NOREMAT-NEXT: addi s10, a0, 512 +; NOREMAT-NEXT: addi a4, a0, 1024 +; NOREMAT-NEXT: addi a6, a0, 1536 +; NOREMAT-NEXT: li t0, 1 +; NOREMAT-NEXT: li a3, 5 ; NOREMAT-NEXT: li t1, 3 -; NOREMAT-NEXT: li t0, 7 -; NOREMAT-NEXT: lui t5, 1 +; NOREMAT-NEXT: li a2, 7 +; NOREMAT-NEXT: lui t2, 1 ; NOREMAT-NEXT: li s4, 9 ; NOREMAT-NEXT: li s6, 11 ; NOREMAT-NEXT: li s9, 13 ; NOREMAT-NEXT: li ra, 15 -; NOREMAT-NEXT: lui t2, 2 +; NOREMAT-NEXT: lui a5, 2 ; NOREMAT-NEXT: lui s1, 3 ; NOREMAT-NEXT: lui t3, 4 ; NOREMAT-NEXT: lui s0, 5 ; NOREMAT-NEXT: lui s3, 6 ; NOREMAT-NEXT: lui s7, 7 -; NOREMAT-NEXT: vsetvli zero, a0, e32, m2, ta, ma -; NOREMAT-NEXT: slli t4, t4, 11 -; NOREMAT-NEXT: sd t4, 512(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: slli a3, a2, 9 -; NOREMAT-NEXT: sd a3, 504(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: vsetvli zero, a7, e32, m2, ta, ma +; NOREMAT-NEXT: slli t0, t0, 11 +; NOREMAT-NEXT: sd t0, 512(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: slli t4, a3, 9 +; NOREMAT-NEXT: sd t4, 504(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: slli t6, t1, 10 -; NOREMAT-NEXT: slli s2, t0, 9 -; NOREMAT-NEXT: add a0, a7, t5 +; NOREMAT-NEXT: slli s2, a2, 9 +; NOREMAT-NEXT: add a7, a0, t2 ; NOREMAT-NEXT: lui s11, 1 ; NOREMAT-NEXT: slli s4, s4, 9 -; NOREMAT-NEXT: slli s5, a2, 10 +; NOREMAT-NEXT: slli s5, a3, 10 ; NOREMAT-NEXT: slli s6, s6, 9 ; NOREMAT-NEXT: slli s8, t1, 11 -; NOREMAT-NEXT: vle32.v v8, (a5) +; NOREMAT-NEXT: vle32.v v8, (s10) ; NOREMAT-NEXT: slli s9, s9, 9 ; NOREMAT-NEXT: li t5, 13 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: slli s10, t0, 10 +; NOREMAT-NEXT: slli s10, a2, 10 ; NOREMAT-NEXT: vle32.v v0, (a6) ; NOREMAT-NEXT: vle32.v v12, (a6) ; NOREMAT-NEXT: slli ra, ra, 9 -; NOREMAT-NEXT: vle32.v v4, (a0) -; NOREMAT-NEXT: vle32.v v20, (a0) -; NOREMAT-NEXT: add a4, a7, t2 +; NOREMAT-NEXT: vle32.v v4, (a7) +; NOREMAT-NEXT: vle32.v v20, (a7) +; NOREMAT-NEXT: add a4, a0, a5 ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: add a4, a7, s1 +; NOREMAT-NEXT: add a4, a0, s1 ; NOREMAT-NEXT: vle32.v v28, (a4) ; NOREMAT-NEXT: vle32.v v26, (a4) -; NOREMAT-NEXT: add a4, a7, t3 +; NOREMAT-NEXT: add a4, a0, t3 ; NOREMAT-NEXT: vle32.v v24, (a4) ; NOREMAT-NEXT: vle32.v v22, (a4) -; NOREMAT-NEXT: add a4, a7, s0 -; NOREMAT-NEXT: vle32.v v14, (a7) +; NOREMAT-NEXT: add a4, a0, s0 +; NOREMAT-NEXT: vle32.v v14, (a0) ; NOREMAT-NEXT: vle32.v v18, (a4) ; NOREMAT-NEXT: vle32.v v16, (a4) -; NOREMAT-NEXT: add a4, a7, s3 +; NOREMAT-NEXT: add a4, a0, s3 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v14, v8 ; NOREMAT-NEXT: vle32.v v14, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v10 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vs2r.v v8, (a0) # vscale x 16-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, t4 +; NOREMAT-NEXT: addi a4, sp, 640 +; NOREMAT-NEXT: vs2r.v v8, (a4) # vscale x 16-byte Folded Spill +; NOREMAT-NEXT: add a4, a0, t0 ; NOREMAT-NEXT: vle32.v v10, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, a3 +; NOREMAT-NEXT: add a4, a0, t4 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v10 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, t6 +; NOREMAT-NEXT: add a4, a0, t6 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v0 ; NOREMAT-NEXT: vle32.v v2, (a4) -; NOREMAT-NEXT: add a4, a7, s2 +; NOREMAT-NEXT: add a4, a0, s2 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v10, v12 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s7 +; NOREMAT-NEXT: add a4, a0, s7 ; NOREMAT-NEXT: vle32.v v0, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v8 ; NOREMAT-NEXT: vle32.v v10, (a4) -; NOREMAT-NEXT: add a4, a7, s4 +; NOREMAT-NEXT: add a4, a0, s4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s5 +; NOREMAT-NEXT: add a4, a0, s5 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v20, v8 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s6 +; NOREMAT-NEXT: add a4, a0, s6 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s8 +; NOREMAT-NEXT: add a4, a0, s8 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, s9 +; NOREMAT-NEXT: add a4, a0, s9 ; NOREMAT-NEXT: vle32.v v20, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: vle32.v v12, (a4) -; NOREMAT-NEXT: add a4, a7, s10 +; NOREMAT-NEXT: add a4, a0, s10 ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v20 ; NOREMAT-NEXT: vle32.v v8, (a4) -; NOREMAT-NEXT: add a4, a7, ra +; NOREMAT-NEXT: add a4, a0, ra ; NOREMAT-NEXT: vle32.v v2, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v4 ; NOREMAT-NEXT: lui t4, 8 -; NOREMAT-NEXT: add a5, a7, t4 +; NOREMAT-NEXT: add a5, a0, t4 ; NOREMAT-NEXT: vle32.v v20, (a5) ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v8, v2 @@ -159,14 +158,14 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a4, a4, 9 ; NOREMAT-NEXT: li s1, 17 ; NOREMAT-NEXT: sd a4, 624(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v4, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v6 ; NOREMAT-NEXT: li a5, 9 ; NOREMAT-NEXT: slli a4, a5, 10 ; NOREMAT-NEXT: sd a4, 616(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v12, (a4) ; NOREMAT-NEXT: vle32.v v6, (a4) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 @@ -174,256 +173,257 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: slli a4, a4, 9 ; NOREMAT-NEXT: li t2, 19 ; NOREMAT-NEXT: sd a4, 608(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a4, a7, a4 +; NOREMAT-NEXT: add a4, a0, a4 ; NOREMAT-NEXT: vle32.v v8, (a4) ; NOREMAT-NEXT: vle32.v v30, (a4) -; NOREMAT-NEXT: slli a3, a2, 11 +; NOREMAT-NEXT: slli a3, a3, 11 ; NOREMAT-NEXT: sd a3, 600(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li s7, 21 ; NOREMAT-NEXT: slli a3, s7, 9 ; NOREMAT-NEXT: sd a3, 592(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 ; NOREMAT-NEXT: li a6, 11 ; NOREMAT-NEXT: slli a3, a6, 10 ; NOREMAT-NEXT: sd a3, 584(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 ; NOREMAT-NEXT: li s3, 23 ; NOREMAT-NEXT: slli a3, s3, 9 ; NOREMAT-NEXT: sd a3, 576(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: li s0, 25 ; NOREMAT-NEXT: slli a3, s0, 9 ; NOREMAT-NEXT: sd a3, 568(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v12, (a3) ; NOREMAT-NEXT: vle32.v v6, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: slli a3, t5, 10 ; NOREMAT-NEXT: sd a3, 560(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v8, (a3) ; NOREMAT-NEXT: vle32.v v30, (a3) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v28 ; NOREMAT-NEXT: li t3, 27 ; NOREMAT-NEXT: slli a3, t3, 9 ; NOREMAT-NEXT: sd a3, 552(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a3, a7, a3 +; NOREMAT-NEXT: add a3, a0, a3 ; NOREMAT-NEXT: vle32.v v28, (a3) ; NOREMAT-NEXT: vle32.v v4, (a3) -; NOREMAT-NEXT: slli a2, t0, 11 +; NOREMAT-NEXT: slli a2, a2, 11 ; NOREMAT-NEXT: sd a2, 544(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 ; NOREMAT-NEXT: li t0, 29 ; NOREMAT-NEXT: slli a2, t0, 9 ; NOREMAT-NEXT: sd a2, 536(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v28 -; NOREMAT-NEXT: li a3, 15 -; NOREMAT-NEXT: slli a2, a3, 10 +; NOREMAT-NEXT: li a7, 15 +; NOREMAT-NEXT: slli a2, a7, 10 ; NOREMAT-NEXT: sd a2, 528(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v12 ; NOREMAT-NEXT: li t1, 31 ; NOREMAT-NEXT: slli a2, t1, 9 ; NOREMAT-NEXT: sd a2, 520(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v8 -; NOREMAT-NEXT: lui a4, 4 -; NOREMAT-NEXT: addi a0, a4, 512 -; NOREMAT-NEXT: sd a0, 496(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 -; NOREMAT-NEXT: vle32.v v8, (a0) -; NOREMAT-NEXT: vle32.v v26, (a0) +; NOREMAT-NEXT: lui a3, 4 +; NOREMAT-NEXT: addi a2, a3, 512 +; NOREMAT-NEXT: sd a2, 496(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a2, a0, a2 +; NOREMAT-NEXT: vle32.v v8, (a2) +; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v28 ; NOREMAT-NEXT: slli a2, s1, 10 ; NOREMAT-NEXT: sd a2, 488(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v12 -; NOREMAT-NEXT: addi a2, a4, 1536 +; NOREMAT-NEXT: addi a2, a3, 1536 ; NOREMAT-NEXT: sd a2, 480(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: lui a4, 4 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, a5, 11 ; NOREMAT-NEXT: sd a2, 472(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v8 ; NOREMAT-NEXT: lui a5, 5 ; NOREMAT-NEXT: addi a2, a5, -1536 ; NOREMAT-NEXT: sd a2, 464(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v28 ; NOREMAT-NEXT: slli a2, t2, 10 ; NOREMAT-NEXT: sd a2, 456(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: li t2, 19 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: li a3, 19 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v12 ; NOREMAT-NEXT: addi a2, a5, -512 ; NOREMAT-NEXT: sd a2, 448(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v24 ; NOREMAT-NEXT: addi a2, a5, 512 ; NOREMAT-NEXT: sd a2, 440(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: slli a2, s7, 10 ; NOREMAT-NEXT: sd a2, 432(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v22, v26 ; NOREMAT-NEXT: addi a2, a5, 1536 ; NOREMAT-NEXT: sd a2, 424(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: slli a2, a6, 11 ; NOREMAT-NEXT: sd a2, 416(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v18 ; NOREMAT-NEXT: lui a6, 6 ; NOREMAT-NEXT: addi a2, a6, -1536 ; NOREMAT-NEXT: sd a2, 408(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: slli a2, s3, 10 ; NOREMAT-NEXT: sd a2, 400(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v16, v24 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v8 ; NOREMAT-NEXT: addi a2, a6, -512 ; NOREMAT-NEXT: sd a2, 392(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v22 ; NOREMAT-NEXT: addi a2, a6, 512 ; NOREMAT-NEXT: sd a2, 384(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, s0, 10 ; NOREMAT-NEXT: sd a2, 376(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v26, v12 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v18 ; NOREMAT-NEXT: addi a2, a6, 1536 ; NOREMAT-NEXT: sd a2, 368(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) ; NOREMAT-NEXT: slli a2, t5, 11 ; NOREMAT-NEXT: sd a2, 360(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v16 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v8 ; NOREMAT-NEXT: lui s0, 7 ; NOREMAT-NEXT: addi a2, s0, -1536 ; NOREMAT-NEXT: sd a2, 352(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t3, 10 ; NOREMAT-NEXT: sd a2, 344(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v14 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, sp, 640 -; NOREMAT-NEXT: vl2r.v v12, (a0) # vscale x 16-byte Folded Reload +; NOREMAT-NEXT: addi a2, sp, 640 +; NOREMAT-NEXT: vl2r.v v12, (a2) # vscale x 16-byte Folded Reload ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v22 ; NOREMAT-NEXT: addi a2, s0, -512 ; NOREMAT-NEXT: sd a2, 336(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v12, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v4, v26 ; NOREMAT-NEXT: addi a2, s0, 512 ; NOREMAT-NEXT: sd a2, 328(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: lui t3, 7 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v26, (a2) ; NOREMAT-NEXT: vle32.v v4, (a2) ; NOREMAT-NEXT: slli a2, t0, 10 ; NOREMAT-NEXT: sd a2, 320(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v2, v18 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v18, (a2) ; NOREMAT-NEXT: vle32.v v2, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v28, v16 ; NOREMAT-NEXT: addi a2, t3, 1536 ; NOREMAT-NEXT: sd a2, 312(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v16, (a2) ; NOREMAT-NEXT: vle32.v v28, (a2) -; NOREMAT-NEXT: slli a2, a3, 11 +; NOREMAT-NEXT: slli a2, a7, 11 ; NOREMAT-NEXT: sd a2, 304(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v6, v8 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v8, (a2) ; NOREMAT-NEXT: vle32.v v6, (a2) ; NOREMAT-NEXT: sf.vc.vv 3, 0, v24, v14 ; NOREMAT-NEXT: addi a2, t4, -1536 ; NOREMAT-NEXT: sd a2, 296(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v14, (a2) ; NOREMAT-NEXT: vle32.v v24, (a2) ; NOREMAT-NEXT: slli a2, t1, 10 ; NOREMAT-NEXT: sd a2, 288(sp) # 8-byte Folded Spill ; NOREMAT-NEXT: sf.vc.vv 3, 0, v30, v22 -; NOREMAT-NEXT: add a2, a7, a2 +; NOREMAT-NEXT: add a2, a0, a2 ; NOREMAT-NEXT: vle32.v v22, (a2) ; NOREMAT-NEXT: vle32.v v30, (a2) -; NOREMAT-NEXT: addi a0, t4, -512 -; NOREMAT-NEXT: sd a0, 280(sp) # 8-byte Folded Spill -; NOREMAT-NEXT: add a0, a7, a0 +; NOREMAT-NEXT: addi a2, t4, -512 +; NOREMAT-NEXT: sd a2, 280(sp) # 8-byte Folded Spill +; NOREMAT-NEXT: add a0, a0, a2 ; NOREMAT-NEXT: sf.vc.vv 3, 0, v12, v0 ; NOREMAT-NEXT: vle32.v v12, (a0) ; NOREMAT-NEXT: vle32.v v0, (a0) @@ -476,7 +476,7 @@ define void @test(ptr %0, ptr %1, i64 %2) { ; NOREMAT-NEXT: addi s11, a0, 512 ; NOREMAT-NEXT: addi s7, a0, 1024 ; NOREMAT-NEXT: addi s3, a0, 1536 -; NOREMAT-NEXT: slli s1, t2, 11 +; NOREMAT-NEXT: slli s1, a3, 11 ; NOREMAT-NEXT: lui a0, 10 ; NOREMAT-NEXT: addi t2, a0, -1536 ; NOREMAT-NEXT: addi a7, a0, -1024 diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll index 34d4657..c68fa59 100644 --- a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll +++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll @@ -78,3 +78,10 @@ define i64 @test0(i64 %n, ptr %p) nounwind { ret i64 %ret } + +; Check for the explicitly emitted .note.GNU-stack section (ELF only) in the +; presence of trampolines. +; UTC_ARGS: --disable +; RV64-LINUX: .section ".note.GNU-stack","x",@progbits +; RV64: .section ".note.GNU-stack","x",@progbits +; UTC_ARGS: --enable diff --git a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll index 9798077..e3ed31f 100644 --- a/llvm/test/CodeGen/SystemZ/llvm.sincos.ll +++ b/llvm/test/CodeGen/SystemZ/llvm.sincos.ll @@ -163,9 +163,9 @@ define { <2 x fp128>, <2 x fp128> } @test_sincos_v2f128(<2 x fp128> %a) #0 { ; LINUX-NEXT: ld %f10, 8(%r3) ; LINUX-NEXT: ld %f0, 16(%r3) ; LINUX-NEXT: ld %f2, 24(%r3) -; LINUX-NEXT: la %r3, 16(%r2) -; LINUX-NEXT: la %r4, 48(%r2) ; LINUX-NEXT: la %r2, 176(%r15) +; LINUX-NEXT: la %r3, 16(%r13) +; LINUX-NEXT: la %r4, 48(%r13) ; LINUX-NEXT: std %f0, 176(%r15) ; LINUX-NEXT: std %f2, 184(%r15) ; LINUX-NEXT: brasl %r14, sincosl@PLT diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll index 6f986ce..c418038 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/reductions.ll @@ -541,11 +541,11 @@ define dso_local arm_aapcs_vfpcc void @two_reductions_mul_add_v8i16(ptr nocaptur ; CHECK-NEXT: cbz r2, .LBB7_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph ; CHECK-NEXT: adds r3, r2, #7 -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: bic r3, r3, #7 ; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: bic r3, r3, #7 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: subs r3, #8 -; CHECK-NEXT: vmov q3, q1 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: add.w r12, r4, r3, lsr #3 ; CHECK-NEXT: mov r3, r0 ; CHECK-NEXT: mov r4, r1 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll index 4020709..fe06601 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/spillingmove.ll @@ -16,39 +16,40 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture ; CHECK-NEXT: sub sp, #64 ; CHECK-NEXT: ldrsh.w r7, [r2] ; CHECK-NEXT: cmp r7, #1 -; CHECK-NEXT: blt.w .LBB0_6 +; CHECK-NEXT: blt .LBB0_6 ; CHECK-NEXT: @ %bb.2: @ %for.cond3.preheader.us.preheader -; CHECK-NEXT: movs r2, #252 ; CHECK-NEXT: ldr r4, [sp, #152] +; CHECK-NEXT: movs r2, #252 ; CHECK-NEXT: and.w r6, r2, r3, lsr #3 ; CHECK-NEXT: movs r2, #120 ; CHECK-NEXT: and.w r5, r2, r3, lsr #9 ; CHECK-NEXT: lsls r3, r3, #3 -; CHECK-NEXT: uxtb r3, r3 ; CHECK-NEXT: muls r6, r4, r6 +; CHECK-NEXT: uxtb r3, r3 ; CHECK-NEXT: rsb.w r2, r4, #256 -; CHECK-NEXT: vmov.i16 q2, #0xfc +; CHECK-NEXT: vmov.i16 q1, #0xfc +; CHECK-NEXT: vdup.16 q0, r6 ; CHECK-NEXT: mul lr, r5, r4 -; CHECK-NEXT: vdup.16 q4, r6 ; CHECK-NEXT: mov.w r6, #2016 -; CHECK-NEXT: vmov.i16 q6, #0xf8 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill ; CHECK-NEXT: mul r5, r3, r4 ; CHECK-NEXT: adds r3, r7, #7 +; CHECK-NEXT: vdup.16 q0, r6 ; CHECK-NEXT: bic r3, r3, #7 -; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vdup.16 q0, r5 +; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vdup.16 q0, lr ; CHECK-NEXT: subs r3, #8 +; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: movs r4, #1 -; CHECK-NEXT: vdup.16 q0, r5 -; CHECK-NEXT: lsls r1, r1, #1 +; CHECK-NEXT: vldrw.u32 q7, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: add.w r3, r4, r3, lsr #3 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0xf800 +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp] @ 16-byte Reload +; CHECK-NEXT: lsls r1, r1, #1 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vdup.16 q5, r6 -; CHECK-NEXT: vmov.i16 q7, #0x78 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #16] @ 16-byte Spill +; CHECK-NEXT: vmov.i16 q4, #0xf8 ; CHECK-NEXT: .LBB0_3: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB0_4 Depth 2 @@ -59,37 +60,31 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha(ptr noalias nocapture ; CHECK-NEXT: @ Parent Loop BB0_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vctp.16 r6 -; CHECK-NEXT: subs r6, #8 +; CHECK-NEXT: vmov.i16 q5, #0xf800 ; CHECK-NEXT: vpst ; CHECK-NEXT: vldrht.u16 q0, [r5] -; CHECK-NEXT: vshr.u16 q1, q0, #3 -; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmla.i16 q2, q1, r2 -; CHECK-NEXT: vshr.u16 q1, q2, #5 -; CHECK-NEXT: vshl.i16 q2, q0, #3 -; CHECK-NEXT: vand q3, q1, q5 -; CHECK-NEXT: vmov q1, q7 -; CHECK-NEXT: vand q2, q2, q6 -; CHECK-NEXT: vmov q7, q6 -; CHECK-NEXT: vmov q6, q5 -; CHECK-NEXT: vmov q5, q4 -; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: subs r6, #8 +; CHECK-NEXT: vshr.u16 q3, q0, #3 +; CHECK-NEXT: vand q3, q3, q1 +; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmla.i16 q1, q3, r2 +; CHECK-NEXT: vshl.i16 q3, q0, #3 +; CHECK-NEXT: vand q3, q3, q4 +; CHECK-NEXT: vmov q4, q6 +; CHECK-NEXT: vshr.u16 q1, q1, #5 +; CHECK-NEXT: vmla.i16 q4, q3, r2 +; CHECK-NEXT: vshr.u16 q3, q4, #11 +; CHECK-NEXT: vand q1, q1, q7 +; CHECK-NEXT: vorr q1, q1, q3 ; CHECK-NEXT: vshr.u16 q0, q0, #9 -; CHECK-NEXT: vmla.i16 q4, q2, r2 -; CHECK-NEXT: vshr.u16 q2, q4, #11 -; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vmov q6, q7 -; CHECK-NEXT: vmov q7, q1 -; CHECK-NEXT: vorr q1, q3, q2 -; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vand q0, q0, q7 -; CHECK-NEXT: vmla.i16 q2, q0, r2 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload -; CHECK-NEXT: vand q0, q2, q0 -; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vmov.i16 q3, #0x78 +; CHECK-NEXT: vmov.i16 q4, #0xf8 +; CHECK-NEXT: vand q0, q0, q3 +; CHECK-NEXT: vmov q3, q2 +; CHECK-NEXT: vmla.i16 q3, q0, r2 +; CHECK-NEXT: vand q0, q3, q5 ; CHECK-NEXT: vorr q0, q1, q0 +; CHECK-NEXT: vmov.i16 q1, #0xfc ; CHECK-NEXT: vpst ; CHECK-NEXT: vstrht.16 q0, [r5], #16 ; CHECK-NEXT: le lr, .LBB0_4 @@ -190,7 +185,7 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: ldrsh.w r12, [r2, #2] ; CHECK-NEXT: cmp.w r12, #1 -; CHECK-NEXT: blt.w .LBB1_7 +; CHECK-NEXT: blt .LBB1_7 ; CHECK-NEXT: @ %bb.1: @ %for.cond3.preheader.lr.ph ; CHECK-NEXT: ldrsh.w r2, [r2] ; CHECK-NEXT: cmp r2, #1 @@ -200,71 +195,70 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: push {r4, r5, r6, r7, lr} ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: sub sp, #80 -; CHECK-NEXT: ldr r7, [sp, #168] +; CHECK-NEXT: ldr r7, [sp, #88] ; CHECK-NEXT: movs r5, #120 ; CHECK-NEXT: lsls r6, r3, #3 ; CHECK-NEXT: movs r4, #252 ; CHECK-NEXT: and.w r5, r5, r3, lsr #9 ; CHECK-NEXT: uxtb r6, r6 ; CHECK-NEXT: and.w r3, r4, r3, lsr #3 +; CHECK-NEXT: adds r4, r2, #7 ; CHECK-NEXT: muls r6, r7, r6 +; CHECK-NEXT: bic r4, r4, #7 ; CHECK-NEXT: mul lr, r3, r7 -; CHECK-NEXT: vdup.16 q0, r6 -; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, lr ; CHECK-NEXT: muls r5, r7, r5 -; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q0, #0xfc -; CHECK-NEXT: mov.w r6, #2016 -; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, r5 ; CHECK-NEXT: rsb.w r3, r7, #256 ; CHECK-NEXT: lsls r7, r1, #1 -; CHECK-NEXT: vstrw.32 q0, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vdup.16 q0, r6 +; CHECK-NEXT: sub.w r1, r4, #8 +; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: vmov.i16 q2, #0xf8 -; CHECK-NEXT: vmov.i16 q5, #0x78 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q6, #0xf800 +; CHECK-NEXT: add.w r1, r4, r1, lsr #3 +; CHECK-NEXT: vdup.16 q6, r6 +; CHECK-NEXT: mov.w r6, #2016 ; CHECK-NEXT: movs r4, #0 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vdup.16 q3, lr +; CHECK-NEXT: vdup.16 q5, r5 +; CHECK-NEXT: vdup.16 q7, r6 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB1_3: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_4 Depth 2 ; CHECK-NEXT: mov r5, r0 -; CHECK-NEXT: dlstp.16 lr, r2 +; CHECK-NEXT: mov r6, r2 +; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: .p2align 2 ; CHECK-NEXT: .LBB1_4: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_3 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vldrh.u16 q0, [r5] +; CHECK-NEXT: vctp.16 r6 +; CHECK-NEXT: vpst +; CHECK-NEXT: vldrht.u16 q0, [r5] ; CHECK-NEXT: vshl.i16 q1, q0, #3 -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: subs r6, #8 ; CHECK-NEXT: vand q1, q1, q2 -; CHECK-NEXT: vmla.i16 q3, q1, r3 -; CHECK-NEXT: vmov.f64 d8, d4 -; CHECK-NEXT: vmov.f64 d9, d5 -; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vshr.u16 q2, q0, #9 +; CHECK-NEXT: vmov.i16 q2, #0x78 +; CHECK-NEXT: vshr.u16 q4, q0, #9 +; CHECK-NEXT: vand q4, q4, q2 +; CHECK-NEXT: vmov q2, q6 +; CHECK-NEXT: vmla.i16 q2, q1, r3 ; CHECK-NEXT: vshr.u16 q0, q0, #3 +; CHECK-NEXT: vmov.i16 q1, #0xfc ; CHECK-NEXT: vand q0, q0, q1 -; CHECK-NEXT: vldrw.u32 q1, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vmla.i16 q1, q0, r3 -; CHECK-NEXT: vand q2, q2, q5 -; CHECK-NEXT: vshr.u16 q0, q3, #11 -; CHECK-NEXT: vldrw.u32 q3, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vshr.u16 q0, q2, #11 +; CHECK-NEXT: vmov q2, q5 +; CHECK-NEXT: vmla.i16 q2, q4, r3 ; CHECK-NEXT: vshr.u16 q1, q1, #5 -; CHECK-NEXT: vmla.i16 q3, q2, r3 +; CHECK-NEXT: vmov.i16 q4, #0xf800 ; CHECK-NEXT: vand q1, q1, q7 ; CHECK-NEXT: vorr q0, q1, q0 -; CHECK-NEXT: vand q1, q3, q6 +; CHECK-NEXT: vand q1, q2, q4 +; CHECK-NEXT: vmov.i16 q2, #0xf8 ; CHECK-NEXT: vorr q0, q0, q1 -; CHECK-NEXT: vstrh.16 q0, [r5], #16 -; CHECK-NEXT: vmov.f64 d4, d8 -; CHECK-NEXT: vmov.f64 d5, d9 -; CHECK-NEXT: letp lr, .LBB1_4 +; CHECK-NEXT: vpst +; CHECK-NEXT: vstrht.16 q0, [r5], #16 +; CHECK-NEXT: le lr, .LBB1_4 ; CHECK-NEXT: @ %bb.5: @ %for.cond3.for.cond.cleanup7_crit_edge.us ; CHECK-NEXT: @ in Loop: Header=BB1_3 Depth=1 ; CHECK-NEXT: adds r4, #1 @@ -272,7 +266,6 @@ define void @__arm_2d_impl_rgb16_colour_filling_with_alpha_sched(ptr noalias noc ; CHECK-NEXT: cmp r4, r12 ; CHECK-NEXT: bne .LBB1_3 ; CHECK-NEXT: @ %bb.6: -; CHECK-NEXT: add sp, #80 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, lr} diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll index 07c06e1..1769c5d 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/varying-outer-2d-reduction.ll @@ -17,17 +17,16 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ptr nocapture %Output, i16 signext %Size, i16 signext %N, i16 signext %Scale) local_unnamed_addr { ; ENABLED-LABEL: varying_outer_2d_reduction: ; ENABLED: @ %bb.0: @ %entry -; ENABLED-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; ENABLED-NEXT: sub sp, #4 ; ENABLED-NEXT: cmp r3, #1 -; ENABLED-NEXT: str r0, [sp] @ 4-byte Spill -; ENABLED-NEXT: blt .LBB0_8 -; ENABLED-NEXT: @ %bb.1: @ %for.body.lr.ph -; ENABLED-NEXT: ldr r0, [sp, #36] -; ENABLED-NEXT: add.w r12, r2, #3 -; ENABLED-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; ENABLED-NEXT: mov.w r8, #0 -; ENABLED-NEXT: mov r9, r12 +; ENABLED-NEXT: it lt +; ENABLED-NEXT: bxlt lr +; ENABLED-NEXT: .LBB0_1: @ %for.body.lr.ph +; ENABLED-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: mov r11, r0 +; ENABLED-NEXT: ldr r0, [sp, #32] +; ENABLED-NEXT: add.w r9, r2, #3 +; ENABLED-NEXT: mov.w r12, #0 +; ENABLED-NEXT: mov r10, r11 ; ENABLED-NEXT: uxth r0, r0 ; ENABLED-NEXT: rsbs r5, r0, #0 ; ENABLED-NEXT: b .LBB0_4 @@ -37,31 +36,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: lsrs r0, r0, #16 ; ENABLED-NEXT: sub.w r9, r9, #1 -; ENABLED-NEXT: strh.w r0, [r1, r8, lsl #1] -; ENABLED-NEXT: add.w r8, r8, #1 +; ENABLED-NEXT: strh.w r0, [r1, r12, lsl #1] +; ENABLED-NEXT: add.w r12, r12, #1 ; ENABLED-NEXT: add.w r10, r10, #2 -; ENABLED-NEXT: cmp r8, r3 +; ENABLED-NEXT: cmp r12, r3 ; ENABLED-NEXT: beq .LBB0_8 ; ENABLED-NEXT: .LBB0_4: @ %for.body ; ENABLED-NEXT: @ =>This Loop Header: Depth=1 ; ENABLED-NEXT: @ Child Loop BB0_6 Depth 2 -; ENABLED-NEXT: cmp r2, r8 +; ENABLED-NEXT: cmp r2, r12 ; ENABLED-NEXT: ble .LBB0_2 ; ENABLED-NEXT: @ %bb.5: @ %vector.ph ; ENABLED-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; ENABLED-NEXT: bic r0, r9, #3 ; ENABLED-NEXT: movs r7, #1 ; ENABLED-NEXT: subs r0, #4 -; ENABLED-NEXT: sub.w r4, r2, r8 +; ENABLED-NEXT: sub.w r4, r2, r12 ; ENABLED-NEXT: vmov.i32 q1, #0x0 ; ENABLED-NEXT: add.w r6, r7, r0, lsr #2 -; ENABLED-NEXT: sub.w r0, r12, r8 +; ENABLED-NEXT: adds r0, r2, #3 +; ENABLED-NEXT: sub.w r0, r0, r12 ; ENABLED-NEXT: bic r0, r0, #3 ; ENABLED-NEXT: subs r0, #4 ; ENABLED-NEXT: add.w r0, r7, r0, lsr #2 ; ENABLED-NEXT: mov r7, r10 ; ENABLED-NEXT: dls lr, r0 -; ENABLED-NEXT: ldr r0, [sp] @ 4-byte Reload +; ENABLED-NEXT: mov r0, r11 ; ENABLED-NEXT: .LBB0_6: @ %vector.body ; ENABLED-NEXT: @ Parent Loop BB0_4 Depth=1 ; ENABLED-NEXT: @ => This Inner Loop Header: Depth=2 @@ -82,23 +82,22 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; ENABLED-NEXT: vpsel q0, q1, q0 ; ENABLED-NEXT: vaddv.u32 r0, q0 ; ENABLED-NEXT: b .LBB0_3 -; ENABLED-NEXT: .LBB0_8: @ %for.end17 -; ENABLED-NEXT: add sp, #4 -; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; ENABLED-NEXT: .LBB0_8: +; ENABLED-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; ENABLED-NEXT: bx lr ; ; NOREDUCTIONS-LABEL: varying_outer_2d_reduction: ; NOREDUCTIONS: @ %bb.0: @ %entry -; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} -; NOREDUCTIONS-NEXT: sub sp, #4 ; NOREDUCTIONS-NEXT: cmp r3, #1 -; NOREDUCTIONS-NEXT: str r0, [sp] @ 4-byte Spill -; NOREDUCTIONS-NEXT: blt .LBB0_8 -; NOREDUCTIONS-NEXT: @ %bb.1: @ %for.body.lr.ph -; NOREDUCTIONS-NEXT: ldr r0, [sp, #36] -; NOREDUCTIONS-NEXT: add.w r12, r2, #3 -; NOREDUCTIONS-NEXT: ldr.w r10, [sp] @ 4-byte Reload -; NOREDUCTIONS-NEXT: mov.w r8, #0 -; NOREDUCTIONS-NEXT: mov r9, r12 +; NOREDUCTIONS-NEXT: it lt +; NOREDUCTIONS-NEXT: bxlt lr +; NOREDUCTIONS-NEXT: .LBB0_1: @ %for.body.lr.ph +; NOREDUCTIONS-NEXT: push.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: mov r11, r0 +; NOREDUCTIONS-NEXT: ldr r0, [sp, #32] +; NOREDUCTIONS-NEXT: add.w r9, r2, #3 +; NOREDUCTIONS-NEXT: mov.w r12, #0 +; NOREDUCTIONS-NEXT: mov r10, r11 ; NOREDUCTIONS-NEXT: uxth r0, r0 ; NOREDUCTIONS-NEXT: rsbs r5, r0, #0 ; NOREDUCTIONS-NEXT: b .LBB0_4 @@ -108,31 +107,32 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: lsrs r0, r0, #16 ; NOREDUCTIONS-NEXT: sub.w r9, r9, #1 -; NOREDUCTIONS-NEXT: strh.w r0, [r1, r8, lsl #1] -; NOREDUCTIONS-NEXT: add.w r8, r8, #1 +; NOREDUCTIONS-NEXT: strh.w r0, [r1, r12, lsl #1] +; NOREDUCTIONS-NEXT: add.w r12, r12, #1 ; NOREDUCTIONS-NEXT: add.w r10, r10, #2 -; NOREDUCTIONS-NEXT: cmp r8, r3 +; NOREDUCTIONS-NEXT: cmp r12, r3 ; NOREDUCTIONS-NEXT: beq .LBB0_8 ; NOREDUCTIONS-NEXT: .LBB0_4: @ %for.body ; NOREDUCTIONS-NEXT: @ =>This Loop Header: Depth=1 ; NOREDUCTIONS-NEXT: @ Child Loop BB0_6 Depth 2 -; NOREDUCTIONS-NEXT: cmp r2, r8 +; NOREDUCTIONS-NEXT: cmp r2, r12 ; NOREDUCTIONS-NEXT: ble .LBB0_2 ; NOREDUCTIONS-NEXT: @ %bb.5: @ %vector.ph ; NOREDUCTIONS-NEXT: @ in Loop: Header=BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: bic r0, r9, #3 ; NOREDUCTIONS-NEXT: movs r7, #1 ; NOREDUCTIONS-NEXT: subs r0, #4 -; NOREDUCTIONS-NEXT: sub.w r4, r2, r8 +; NOREDUCTIONS-NEXT: sub.w r4, r2, r12 ; NOREDUCTIONS-NEXT: vmov.i32 q1, #0x0 ; NOREDUCTIONS-NEXT: add.w r6, r7, r0, lsr #2 -; NOREDUCTIONS-NEXT: sub.w r0, r12, r8 +; NOREDUCTIONS-NEXT: adds r0, r2, #3 +; NOREDUCTIONS-NEXT: sub.w r0, r0, r12 ; NOREDUCTIONS-NEXT: bic r0, r0, #3 ; NOREDUCTIONS-NEXT: subs r0, #4 ; NOREDUCTIONS-NEXT: add.w r0, r7, r0, lsr #2 ; NOREDUCTIONS-NEXT: mov r7, r10 ; NOREDUCTIONS-NEXT: dls lr, r0 -; NOREDUCTIONS-NEXT: ldr r0, [sp] @ 4-byte Reload +; NOREDUCTIONS-NEXT: mov r0, r11 ; NOREDUCTIONS-NEXT: .LBB0_6: @ %vector.body ; NOREDUCTIONS-NEXT: @ Parent Loop BB0_4 Depth=1 ; NOREDUCTIONS-NEXT: @ => This Inner Loop Header: Depth=2 @@ -153,9 +153,9 @@ define dso_local void @varying_outer_2d_reduction(ptr nocapture readonly %Input, ; NOREDUCTIONS-NEXT: vpsel q0, q1, q0 ; NOREDUCTIONS-NEXT: vaddv.u32 r0, q0 ; NOREDUCTIONS-NEXT: b .LBB0_3 -; NOREDUCTIONS-NEXT: .LBB0_8: @ %for.end17 -; NOREDUCTIONS-NEXT: add sp, #4 -; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, pc} +; NOREDUCTIONS-NEXT: .LBB0_8: +; NOREDUCTIONS-NEXT: pop.w {r4, r5, r6, r7, r9, r10, r11, lr} +; NOREDUCTIONS-NEXT: bx lr entry: %conv = sext i16 %N to i32 %cmp36 = icmp sgt i16 %N, 0 diff --git a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll index e0a61b1..78dc35b 100644 --- a/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll +++ b/llvm/test/CodeGen/Thumb2/LowOverheadLoops/vcmp-vpst-combination.ll @@ -49,18 +49,17 @@ define i32 @vcmp_new_vpst_combination(i32 %len, ptr nocapture readonly %arr) { ; CHECK-NEXT: cmp r0, #1 ; CHECK-NEXT: blt .LBB1_4 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: vmov.i32 q1, #0x1 +; CHECK-NEXT: vmov.i32 q0, #0x1 ; CHECK-NEXT: movs r2, #0 ; CHECK-NEXT: dlstp.32 lr, r0 ; CHECK-NEXT: .LBB1_2: @ %vector.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q2, [r1], #16 -; CHECK-NEXT: vcmp.i32 ne, q2, zr -; CHECK-NEXT: vmov q2, q0 +; CHECK-NEXT: vldrw.u32 q1, [r1], #16 +; CHECK-NEXT: vcmp.i32 ne, q1, zr +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: vpst -; CHECK-NEXT: vmovt q2, q1 -; CHECK-NEXT: vaddva.u32 r2, q2 +; CHECK-NEXT: vmovt q1, q0 +; CHECK-NEXT: vaddva.u32 r2, q1 ; CHECK-NEXT: letp lr, .LBB1_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: mov r0, r2 diff --git a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll index c8dd949..a904347 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float16regloops.ll @@ -993,10 +993,10 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, r11, lr} ; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: .pad #20 +; CHECK-NEXT: sub sp, #20 ; CHECK-NEXT: cmp r3, #8 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: blo.w .LBB16_12 ; CHECK-NEXT: @ %bb.1: @ %if.then ; CHECK-NEXT: lsrs.w r12, r3, #2 @@ -1016,50 +1016,48 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: subs r1, r7, #2 ; CHECK-NEXT: rsbs r7, r4, #0 -; CHECK-NEXT: str r7, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: add.w r7, r3, #16 -; CHECK-NEXT: str r4, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: str r7, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_6 ; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add.w r5, r5, r0, lsl #1 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r6, r6, r0, lsl #1 ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_4: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: wls lr, r0, .LBB16_5 ; CHECK-NEXT: b .LBB16_10 ; CHECK-NEXT: .LBB16_5: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #8 -; CHECK-NEXT: add.w r0, r5, r0, lsl #1 +; CHECK-NEXT: add.w r0, r6, r0, lsl #1 ; CHECK-NEXT: add.w r5, r0, #8 ; CHECK-NEXT: beq.w .LBB16_12 ; CHECK-NEXT: .LBB16_6: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB16_8 Depth 2 ; CHECK-NEXT: @ Child Loop BB16_11 Depth 2 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: ldrh.w lr, [r3, #14] ; CHECK-NEXT: vldrw.u32 q0, [r0], #8 -; CHECK-NEXT: ldrh.w r8, [r3, #12] +; CHECK-NEXT: ldrh.w r10, [r3, #12] ; CHECK-NEXT: ldrh r7, [r3, #10] ; CHECK-NEXT: ldrh r4, [r3, #8] ; CHECK-NEXT: ldrh r6, [r3, #6] ; CHECK-NEXT: ldrh.w r9, [r3, #4] ; CHECK-NEXT: ldrh.w r11, [r3, #2] -; CHECK-NEXT: ldrh.w r10, [r3] +; CHECK-NEXT: ldrh.w r8, [r3] ; CHECK-NEXT: vstrb.8 q0, [r1], #8 ; CHECK-NEXT: vldrw.u32 q0, [r5] -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: adds r0, r5, #2 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: vmul.f16 q0, q0, r10 +; CHECK-NEXT: vmul.f16 q0, q0, r8 ; CHECK-NEXT: adds r0, r5, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r11 ; CHECK-NEXT: vldrw.u32 q1, [r5, #4] @@ -1068,73 +1066,73 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: add.w r0, r5, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r6 ; CHECK-NEXT: vldrw.u32 q1, [r5, #8] +; CHECK-NEXT: add.w r6, r5, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: vldrw.u32 q1, [r0] ; CHECK-NEXT: add.w r0, r5, #14 ; CHECK-NEXT: vfma.f16 q0, q1, r7 ; CHECK-NEXT: vldrw.u32 q1, [r5, #12] -; CHECK-NEXT: adds r5, #16 -; CHECK-NEXT: vfma.f16 q0, q1, r8 +; CHECK-NEXT: vfma.f16 q0, q1, r10 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vfma.f16 q0, q1, lr ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: blo .LBB16_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 ; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r5, r3, #16 ; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: .LBB16_8: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r0, [r6], #16 -; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: adds r4, r5, #2 +; CHECK-NEXT: ldrh r0, [r5], #16 +; CHECK-NEXT: vldrw.u32 q1, [r6] +; CHECK-NEXT: adds r4, r6, #2 ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-14] -; CHECK-NEXT: adds r4, r5, #6 +; CHECK-NEXT: ldrh r0, [r5, #-14] +; CHECK-NEXT: adds r4, r6, #6 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-12] -; CHECK-NEXT: vldrw.u32 q1, [r5, #4] +; CHECK-NEXT: ldrh r0, [r5, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r6, #4] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-10] -; CHECK-NEXT: add.w r4, r5, #10 +; CHECK-NEXT: ldrh r0, [r5, #-10] +; CHECK-NEXT: add.w r4, r6, #10 ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-8] -; CHECK-NEXT: vldrw.u32 q1, [r5, #8] +; CHECK-NEXT: ldrh r0, [r5, #-8] +; CHECK-NEXT: vldrw.u32 q1, [r6, #8] ; CHECK-NEXT: vfma.f16 q0, q1, r0 ; CHECK-NEXT: vldrw.u32 q1, [r4] -; CHECK-NEXT: ldrh r0, [r6, #-6] -; CHECK-NEXT: ldrh r4, [r6, #-2] +; CHECK-NEXT: ldrh r0, [r5, #-6] +; CHECK-NEXT: ldrh r4, [r5, #-2] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: ldrh r0, [r6, #-4] -; CHECK-NEXT: vldrw.u32 q1, [r5, #12] +; CHECK-NEXT: ldrh r0, [r5, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r6, #12] ; CHECK-NEXT: vfma.f16 q0, q1, r0 -; CHECK-NEXT: add.w r0, r5, #14 +; CHECK-NEXT: add.w r0, r6, #14 ; CHECK-NEXT: vldrw.u32 q1, [r0] -; CHECK-NEXT: adds r5, #16 +; CHECK-NEXT: adds r6, #16 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: le lr, .LBB16_8 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: add.w r5, r3, #16 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: mov r0, r5 +; CHECK-NEXT: mov r0, r6 ; CHECK-NEXT: .LBB16_11: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldrh r4, [r6], #2 +; CHECK-NEXT: ldrh r4, [r5], #2 ; CHECK-NEXT: vldrh.u16 q1, [r0], #2 ; CHECK-NEXT: vfma.f16 q0, q1, r4 ; CHECK-NEXT: le lr, .LBB16_11 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_12: @ %if.end -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #20 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} entry: %pState1 = getelementptr inbounds %struct.arm_fir_instance_f32, ptr %S, i32 0, i32 1 diff --git a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll index 28166e4..f7b4548 100644 --- a/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll +++ b/llvm/test/CodeGen/Thumb2/mve-float32regloops.ll @@ -995,46 +995,44 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 ; CHECK-NEXT: ldrh r6, [r0] -; CHECK-NEXT: movs r5, #1 -; CHECK-NEXT: ldrd r4, r10, [r0, #4] +; CHECK-NEXT: movs r4, #1 +; CHECK-NEXT: ldrd r7, r10, [r0, #4] ; CHECK-NEXT: sub.w r0, r6, #8 ; CHECK-NEXT: add.w r3, r0, r0, lsr #29 ; CHECK-NEXT: and r0, r0, #7 -; CHECK-NEXT: asrs r7, r3, #3 -; CHECK-NEXT: cmp r7, #1 +; CHECK-NEXT: asrs r5, r3, #3 +; CHECK-NEXT: cmp r5, #1 ; CHECK-NEXT: it gt -; CHECK-NEXT: asrgt r5, r3, #3 -; CHECK-NEXT: add.w r3, r4, r6, lsl #2 +; CHECK-NEXT: asrgt r4, r3, #3 +; CHECK-NEXT: add.w r3, r7, r6, lsl #2 ; CHECK-NEXT: sub.w r9, r3, #4 ; CHECK-NEXT: rsbs r3, r6, #0 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill -; CHECK-NEXT: add.w r3, r10, #32 -; CHECK-NEXT: str r5, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: str r6, [sp, #16] @ 4-byte Spill -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r4, [sp] @ 4-byte Spill +; CHECK-NEXT: str r6, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: str r0, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: b .LBB16_6 ; CHECK-NEXT: .LBB16_3: @ %while.end.loopexit ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r4, r4, r0, lsl #2 +; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: add.w r7, r7, r0, lsl #2 ; CHECK-NEXT: b .LBB16_5 ; CHECK-NEXT: .LBB16_4: @ %for.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload -; CHECK-NEXT: ldrd r0, r9, [sp, #20] @ 8-byte Folded Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldrd r0, r9, [sp, #12] @ 8-byte Folded Reload ; CHECK-NEXT: wls lr, r0, .LBB16_5 ; CHECK-NEXT: b .LBB16_10 ; CHECK-NEXT: .LBB16_5: @ %while.end ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: subs.w r12, r12, #1 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 -; CHECK-NEXT: add.w r0, r4, r0, lsl #2 -; CHECK-NEXT: add.w r4, r0, #16 +; CHECK-NEXT: add.w r0, r7, r0, lsl #2 +; CHECK-NEXT: add.w r7, r0, #16 ; CHECK-NEXT: beq .LBB16_12 ; CHECK-NEXT: .LBB16_6: @ %while.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 @@ -1042,76 +1040,76 @@ define void @fir(ptr nocapture readonly %S, ptr nocapture readonly %pSrc, ptr no ; CHECK-NEXT: @ Child Loop BB16_11 Depth 2 ; CHECK-NEXT: add.w lr, r10, #8 ; CHECK-NEXT: vldrw.u32 q0, [r1], #16 -; CHECK-NEXT: ldrd r3, r7, [r10] +; CHECK-NEXT: ldrd r3, r4, [r10] ; CHECK-NEXT: ldm.w lr, {r0, r5, r6, lr} ; CHECK-NEXT: ldrd r11, r8, [r10, #24] ; CHECK-NEXT: vstrb.8 q0, [r9], #16 -; CHECK-NEXT: vldrw.u32 q0, [r4], #32 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: str.w r9, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] +; CHECK-NEXT: vldrw.u32 q0, [r7], #32 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str.w r9, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [r7, #-28] ; CHECK-NEXT: vmul.f32 q0, q0, r3 -; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] -; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] -; CHECK-NEXT: vfma.f32 q0, q1, r7 -; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] +; CHECK-NEXT: vldrw.u32 q6, [r7, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r7, #-20] +; CHECK-NEXT: vfma.f32 q0, q1, r4 +; CHECK-NEXT: vldrw.u32 q5, [r7, #-16] ; CHECK-NEXT: vfma.f32 q0, q6, r0 -; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] +; CHECK-NEXT: vldrw.u32 q2, [r7, #-12] ; CHECK-NEXT: vfma.f32 q0, q4, r5 -; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] +; CHECK-NEXT: vldrw.u32 q3, [r7, #-8] ; CHECK-NEXT: vfma.f32 q0, q5, r6 -; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r7, #-4] ; CHECK-NEXT: vfma.f32 q0, q2, lr -; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: vfma.f32 q0, q3, r11 ; CHECK-NEXT: vfma.f32 q0, q1, r8 ; CHECK-NEXT: cmp r0, #16 ; CHECK-NEXT: blo .LBB16_9 ; CHECK-NEXT: @ %bb.7: @ %for.body.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r4, r10, #32 ; CHECK-NEXT: dls lr, r0 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: .LBB16_8: @ %for.body ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldm.w r7, {r0, r3, r5, r6, r8, r11} -; CHECK-NEXT: vldrw.u32 q1, [r4], #32 -; CHECK-NEXT: vldrw.u32 q6, [r4, #-24] -; CHECK-NEXT: vldrw.u32 q4, [r4, #-20] +; CHECK-NEXT: ldm.w r4, {r0, r3, r5, r6, r8, r11} +; CHECK-NEXT: vldrw.u32 q1, [r7], #32 +; CHECK-NEXT: vldrw.u32 q6, [r7, #-24] +; CHECK-NEXT: vldrw.u32 q4, [r7, #-20] ; CHECK-NEXT: vfma.f32 q0, q1, r0 -; CHECK-NEXT: vldrw.u32 q1, [r4, #-28] -; CHECK-NEXT: vldrw.u32 q5, [r4, #-16] -; CHECK-NEXT: vldrw.u32 q2, [r4, #-12] +; CHECK-NEXT: vldrw.u32 q1, [r7, #-28] +; CHECK-NEXT: vldrw.u32 q5, [r7, #-16] +; CHECK-NEXT: vldrw.u32 q2, [r7, #-12] ; CHECK-NEXT: vfma.f32 q0, q1, r3 -; CHECK-NEXT: ldrd r9, r1, [r7, #24] +; CHECK-NEXT: ldrd r9, r1, [r4, #24] ; CHECK-NEXT: vfma.f32 q0, q6, r5 -; CHECK-NEXT: vldrw.u32 q3, [r4, #-8] +; CHECK-NEXT: vldrw.u32 q3, [r7, #-8] ; CHECK-NEXT: vfma.f32 q0, q4, r6 -; CHECK-NEXT: vldrw.u32 q1, [r4, #-4] +; CHECK-NEXT: vldrw.u32 q1, [r7, #-4] ; CHECK-NEXT: vfma.f32 q0, q5, r8 -; CHECK-NEXT: adds r7, #32 +; CHECK-NEXT: adds r4, #32 ; CHECK-NEXT: vfma.f32 q0, q2, r11 ; CHECK-NEXT: vfma.f32 q0, q3, r9 ; CHECK-NEXT: vfma.f32 q0, q1, r1 ; CHECK-NEXT: le lr, .LBB16_8 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_9: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: add.w r4, r10, #32 ; CHECK-NEXT: b .LBB16_4 ; CHECK-NEXT: .LBB16_10: @ %while.body76.preheader ; CHECK-NEXT: @ in Loop: Header=BB16_6 Depth=1 -; CHECK-NEXT: mov r3, r4 +; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: .LBB16_11: @ %while.body76 ; CHECK-NEXT: @ Parent Loop BB16_6 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: ldr r0, [r7], #4 +; CHECK-NEXT: ldr r0, [r4], #4 ; CHECK-NEXT: vldrw.u32 q1, [r3], #4 ; CHECK-NEXT: vfma.f32 q0, q1, r0 ; CHECK-NEXT: le lr, .LBB16_11 ; CHECK-NEXT: b .LBB16_3 ; CHECK-NEXT: .LBB16_12: -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, lr} diff --git a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll index e8b49c1..0d86f22 100644 --- a/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-gather-increment.ll @@ -711,8 +711,8 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #136 -; CHECK-NEXT: sub sp, #136 +; CHECK-NEXT: .pad #120 +; CHECK-NEXT: sub sp, #120 ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: strd r1, r2, [sp, #64] @ 8-byte Folded Spill ; CHECK-NEXT: blt.w .LBB14_5 @@ -725,22 +725,20 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill ; CHECK-NEXT: subs r1, #8 ; CHECK-NEXT: vstrw.32 q0, [sp, #40] @ 16-byte Spill -; CHECK-NEXT: vmov.i16 q2, #0x18 ; CHECK-NEXT: add.w r1, r2, r1, lsr #3 ; CHECK-NEXT: str r1, [sp, #60] @ 4-byte Spill ; CHECK-NEXT: adr r1, .LCPI14_0 ; CHECK-NEXT: adr r2, .LCPI14_1 ; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill ; CHECK-NEXT: vstrw.32 q0, [sp, #24] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r2] -; CHECK-NEXT: add r2, sp, #120 +; CHECK-NEXT: add r2, sp, #104 ; CHECK-NEXT: vstrw.32 q0, [sp, #8] @ 16-byte Spill ; CHECK-NEXT: .LBB14_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB14_3 Depth 2 ; CHECK-NEXT: ldr r1, [sp, #60] @ 4-byte Reload -; CHECK-NEXT: add.w r10, sp, #104 +; CHECK-NEXT: add.w r10, sp, #88 ; CHECK-NEXT: dls lr, r1 ; CHECK-NEXT: ldr r7, [sp, #64] @ 4-byte Reload ; CHECK-NEXT: vldrw.u32 q4, [sp, #24] @ 16-byte Reload @@ -762,7 +760,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: vmov r6, r2, d4 ; CHECK-NEXT: ldrh r1, [r1] ; CHECK-NEXT: ldrh.w r12, [r4] -; CHECK-NEXT: add r4, sp, #88 +; CHECK-NEXT: add r4, sp, #72 ; CHECK-NEXT: ldrh.w r11, [r5] ; CHECK-NEXT: ldrh r3, [r3] ; CHECK-NEXT: ldrh r5, [r6] @@ -807,7 +805,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.16 q3[1], r5 ; CHECK-NEXT: vmov r2, r5, d5 -; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vmov.i16 q2, #0x18 ; CHECK-NEXT: vadd.i16 q6, q6, q2 ; CHECK-NEXT: vadd.i16 q5, q5, q2 ; CHECK-NEXT: vadd.i16 q4, q4, q2 @@ -849,7 +847,7 @@ define arm_aapcs_vfpcc void @gather_inc_v8i16_complex(ptr noalias nocapture read ; CHECK-NEXT: cmp r1, r3 ; CHECK-NEXT: bne.w .LBB14_2 ; CHECK-NEXT: .LBB14_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #136 +; CHECK-NEXT: add sp, #120 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -950,7 +948,6 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: vstrw.32 q0, [sp, #80] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: adr r6, .LCPI15_9 -; CHECK-NEXT: vmov.i32 q2, #0x30 ; CHECK-NEXT: vstrw.32 q0, [sp, #64] @ 16-byte Spill ; CHECK-NEXT: vldrw.u32 q0, [r7] ; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill @@ -963,212 +960,213 @@ define arm_aapcs_vfpcc void @gather_inc_v16i8_complex(ptr noalias nocapture read ; CHECK-NEXT: .LBB15_2: @ %vector.ph ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB15_3 Depth 2 +; CHECK-NEXT: vldrw.u32 q2, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: adr r1, .LCPI15_3 -; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q0, [r1] -; CHECK-NEXT: adr r1, .LCPI15_4 ; CHECK-NEXT: vldrw.u32 q5, [r1] +; CHECK-NEXT: adr r1, .LCPI15_4 +; CHECK-NEXT: vstrw.32 q2, [sp, #296] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [r1] ; CHECK-NEXT: adr r1, .LCPI15_2 -; CHECK-NEXT: vldrw.u32 q3, [r1] +; CHECK-NEXT: vstrw.32 q2, [sp, #280] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q1, [r1] ; CHECK-NEXT: adr r1, .LCPI15_10 -; CHECK-NEXT: vstrw.32 q6, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #32] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #80] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q3, [r1] ; CHECK-NEXT: adr r1, .LCPI15_11 ; CHECK-NEXT: ldr.w r8, [sp, #116] @ 4-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #80] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q6, [sp, #264] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q6, [sp, #48] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q2, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #64] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q0, [sp, #96] @ 16-byte Reload ; CHECK-NEXT: vldrw.u32 q7, [r1] -; CHECK-NEXT: vldrw.u32 q1, [sp] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q3, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #96] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload ; CHECK-NEXT: mov r11, r10 -; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q2, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill ; CHECK-NEXT: .LBB15_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB15_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: vadd.i32 q4, q1, r0 -; CHECK-NEXT: vstrw.32 q7, [sp, #136] @ 16-byte Spill -; CHECK-NEXT: vmov r1, lr, d8 -; CHECK-NEXT: vadd.i32 q7, q7, r0 -; CHECK-NEXT: vmov r5, r4, d15 -; CHECK-NEXT: vadd.i32 q6, q0, r0 -; CHECK-NEXT: vmov r6, r7, d13 +; CHECK-NEXT: vmov q0, q7 +; CHECK-NEXT: vstrw.32 q7, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q7, q5, r0 +; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill +; CHECK-NEXT: vadd.i32 q5, q0, r0 +; CHECK-NEXT: vmov q0, q6 +; CHECK-NEXT: vadd.i32 q6, q4, r0 +; CHECK-NEXT: vmov r5, r4, d11 +; CHECK-NEXT: vmov r1, lr, d12 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vmov r6, r7, d15 ; CHECK-NEXT: vstrw.32 q1, [sp, #152] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vstrw.32 q0, [sp, #168] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q0, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vldrw.u32 q3, [sp, #216] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, r0 -; CHECK-NEXT: vstrw.32 q5, [sp, #120] @ 16-byte Spill -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov q1, q3 +; CHECK-NEXT: vstrw.32 q4, [sp, #168] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #248] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #120] @ 16-byte Spill +; CHECK-NEXT: vstrw.32 q3, [sp, #136] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q3, [sp, #184] @ 16-byte Reload ; CHECK-NEXT: subs.w r11, r11, #16 -; CHECK-NEXT: ldrb.w r9, [r1] -; CHECK-NEXT: vmov r1, r3, d14 ; CHECK-NEXT: ldrb r5, [r5] +; CHECK-NEXT: ldrb.w r9, [r1] +; CHECK-NEXT: vmov r1, r3, d10 ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[0], r1 +; CHECK-NEXT: vmov.8 q5[0], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[1], r1 -; CHECK-NEXT: vmov r1, r3, d12 -; CHECK-NEXT: vmov.8 q7[2], r5 +; CHECK-NEXT: vmov.8 q5[1], r1 +; CHECK-NEXT: vmov r1, r3, d14 +; CHECK-NEXT: vmov.8 q5[2], r5 ; CHECK-NEXT: ldrb r5, [r6] ; CHECK-NEXT: ldrb r6, [r4] -; CHECK-NEXT: vmov.8 q7[3], r6 +; CHECK-NEXT: vmov.8 q5[3], r6 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[0], r1 -; CHECK-NEXT: vmov r6, r1, d2 -; CHECK-NEXT: vmov.8 q6[1], r3 -; CHECK-NEXT: vmov.8 q6[2], r5 -; CHECK-NEXT: vmov.8 q6[3], r7 +; CHECK-NEXT: vmov.8 q7[0], r1 +; CHECK-NEXT: vmov r6, r1, d4 +; CHECK-NEXT: vmov.8 q7[1], r3 +; CHECK-NEXT: vmov.8 q7[2], r5 +; CHECK-NEXT: vmov.8 q7[3], r7 ; CHECK-NEXT: ldrb.w r7, [lr] -; CHECK-NEXT: vmov.8 q6[4], r9 -; CHECK-NEXT: vmov.8 q6[5], r7 +; CHECK-NEXT: vmov.8 q7[4], r9 +; CHECK-NEXT: vmov.8 q7[5], r7 ; CHECK-NEXT: ldrb r4, [r1] -; CHECK-NEXT: vmov r1, r5, d3 -; CHECK-NEXT: vldrw.u32 q1, [sp, #232] @ 16-byte Reload +; CHECK-NEXT: vmov r1, r5, d5 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #280] @ 16-byte Reload ; CHECK-NEXT: ldrb.w r12, [r1] -; CHECK-NEXT: vmov r1, r3, d9 +; CHECK-NEXT: vmov r1, r3, d13 ; CHECK-NEXT: ldrb r5, [r5] -; CHECK-NEXT: vldrw.u32 q4, [sp, #184] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #232] @ 16-byte Reload ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q6[6], r1 -; CHECK-NEXT: vmov r1, r7, d0 -; CHECK-NEXT: vmov.8 q6[7], r3 +; CHECK-NEXT: vmov.8 q7[6], r1 +; CHECK-NEXT: vmov r1, r7, d4 +; CHECK-NEXT: vmov.8 q7[7], r3 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r7, [r7] -; CHECK-NEXT: vmov.8 q7[4], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #264] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[5], r7 -; CHECK-NEXT: vadd.i32 q0, q0, r0 +; CHECK-NEXT: vmov.8 q5[4], r1 +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: vmov.8 q5[5], r7 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vldrw.u32 q1, [sp, #296] @ 16-byte Reload ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] -; CHECK-NEXT: vmov.8 q7[6], r1 +; CHECK-NEXT: vmov.8 q5[6], r1 ; CHECK-NEXT: ldrb r1, [r6] -; CHECK-NEXT: vmov r7, r6, d0 -; CHECK-NEXT: vmov.8 q7[7], r3 -; CHECK-NEXT: vmov r3, lr, d1 -; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vmov.8 q7[8], r1 -; CHECK-NEXT: vadd.i32 q0, q0, r0 -; CHECK-NEXT: vmov.8 q7[9], r4 -; CHECK-NEXT: vmov r4, r1, d0 -; CHECK-NEXT: vmov.8 q7[10], r12 -; CHECK-NEXT: vmov.8 q7[11], r5 +; CHECK-NEXT: vmov.8 q5[7], r3 +; CHECK-NEXT: vmov r7, r6, d4 +; CHECK-NEXT: vmov r3, lr, d5 +; CHECK-NEXT: vmov.8 q5[8], r1 +; CHECK-NEXT: vadd.i32 q2, q1, r0 +; CHECK-NEXT: vmov.8 q5[9], r4 +; CHECK-NEXT: vmov r4, r1, d4 +; CHECK-NEXT: vmov.8 q5[10], r12 +; CHECK-NEXT: vmov.8 q5[11], r5 +; CHECK-NEXT: vldrw.u32 q1, [sp, #264] @ 16-byte Reload ; CHECK-NEXT: ldrb r7, [r7] ; CHECK-NEXT: ldrb r6, [r6] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: ldrb r4, [r4] ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q6[8], r4 -; CHECK-NEXT: vmov r5, r4, d1 -; CHECK-NEXT: vmov.8 q6[9], r1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload +; CHECK-NEXT: vmov.8 q7[8], r4 +; CHECK-NEXT: vmov r5, r4, d5 +; CHECK-NEXT: vmov.8 q7[9], r1 +; CHECK-NEXT: vadd.i32 q2, q0, r0 +; CHECK-NEXT: vldrw.u32 q0, [sp, #216] @ 16-byte Reload ; CHECK-NEXT: ldrb r5, [r5] ; CHECK-NEXT: ldrb r4, [r4] -; CHECK-NEXT: vmov.8 q6[10], r5 -; CHECK-NEXT: vmov.8 q6[11], r4 -; CHECK-NEXT: vmov.8 q6[12], r7 -; CHECK-NEXT: vmov.8 q6[13], r6 -; CHECK-NEXT: vmov.8 q6[14], r3 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov.8 q7[10], r5 +; CHECK-NEXT: vmov.8 q7[11], r4 +; CHECK-NEXT: vmov.8 q7[12], r7 +; CHECK-NEXT: vmov.8 q7[13], r6 +; CHECK-NEXT: vmov.8 q7[14], r3 +; CHECK-NEXT: vmov r1, r3, d4 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[12], r1 +; CHECK-NEXT: vmov.8 q5[12], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q1, r0 -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #232] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #248] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 -; CHECK-NEXT: vstrw.32 q1, [sp, #248] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vmov.8 q5[13], r1 +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: vadd.i32 q2, q1, r0 ; CHECK-NEXT: ldrb r1, [r1] -; CHECK-NEXT: vmov.8 q7[14], r1 +; CHECK-NEXT: vmov.8 q5[14], r1 ; CHECK-NEXT: ldrb r1, [r3] -; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vmov.8 q5[15], r1 ; CHECK-NEXT: ldrb.w r1, [lr] -; CHECK-NEXT: vmov.8 q6[15], r1 -; CHECK-NEXT: vmov r1, r3, d0 -; CHECK-NEXT: vadd.i8 q6, q6, q7 +; CHECK-NEXT: vmov.8 q7[15], r1 +; CHECK-NEXT: vmov r1, r3, d4 +; CHECK-NEXT: vadd.i8 q5, q7, q5 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: ldrb r3, [r3] ; CHECK-NEXT: vmov.8 q7[0], r1 ; CHECK-NEXT: vmov.8 q7[1], r3 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q3, r0 -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #216] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #296] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #296] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #280] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #280] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q3, [sp, #264] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q3, q3, q2 -; CHECK-NEXT: vstrw.32 q3, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: vadd.i32 q2, q4, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[2], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[3], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d4 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[4], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[5], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q5, r0 -; CHECK-NEXT: vadd.i32 q5, q5, q2 -; CHECK-NEXT: vstrw.32 q5, [sp, #200] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q5, [sp, #120] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: vadd.i32 q2, q6, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[6], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[7], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d4 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[8], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[9], r1 -; CHECK-NEXT: vmov r1, r3, d1 -; CHECK-NEXT: vadd.i32 q0, q4, r0 -; CHECK-NEXT: vadd.i32 q4, q4, q2 -; CHECK-NEXT: vstrw.32 q4, [sp, #184] @ 16-byte Spill +; CHECK-NEXT: vmov r1, r3, d5 +; CHECK-NEXT: vadd.i32 q2, q0, r0 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[10], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[11], r1 -; CHECK-NEXT: vmov r1, r3, d0 +; CHECK-NEXT: vmov r1, r3, d4 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[12], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[13], r1 -; CHECK-NEXT: vmov r1, r3, d1 +; CHECK-NEXT: vmov r1, r3, d5 ; CHECK-NEXT: ldrb r1, [r1] ; CHECK-NEXT: vmov.8 q7[14], r1 ; CHECK-NEXT: ldrb r1, [r3] ; CHECK-NEXT: vmov.8 q7[15], r1 -; CHECK-NEXT: vadd.i8 q0, q6, q7 -; CHECK-NEXT: vldrw.u32 q7, [sp, #136] @ 16-byte Reload -; CHECK-NEXT: vstrb.8 q0, [r8], #16 -; CHECK-NEXT: vldrw.u32 q0, [sp, #168] @ 16-byte Reload -; CHECK-NEXT: vadd.i32 q7, q7, q2 +; CHECK-NEXT: vadd.i8 q2, q5, q7 +; CHECK-NEXT: vldrw.u32 q5, [sp, #200] @ 16-byte Reload +; CHECK-NEXT: vstrb.8 q2, [r8], #16 +; CHECK-NEXT: vmov.i32 q2, #0x30 +; CHECK-NEXT: vadd.i32 q6, q6, q2 +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vstrw.32 q6, [sp, #232] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #296] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vadd.i32 q4, q4, q2 +; CHECK-NEXT: vadd.i32 q6, q6, q2 ; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vmov q7, q3 +; CHECK-NEXT: vldrw.u32 q3, [sp, #136] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q1, [sp, #264] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q1, [sp, #152] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q4, [sp, #248] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q4, [sp, #168] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q6, [sp, #296] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q6, [sp, #120] @ 16-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #216] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q0, [sp, #280] @ 16-byte Reload +; CHECK-NEXT: vadd.i32 q5, q5, q2 +; CHECK-NEXT: vadd.i32 q3, q3, q2 +; CHECK-NEXT: vadd.i32 q0, q0, q2 +; CHECK-NEXT: vadd.i32 q4, q4, q2 +; CHECK-NEXT: vadd.i32 q1, q1, q2 +; CHECK-NEXT: vadd.i32 q6, q6, q2 +; CHECK-NEXT: vstrw.32 q0, [sp, #280] @ 16-byte Spill ; CHECK-NEXT: bne.w .LBB15_3 ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB15_2 Depth=1 @@ -1501,14 +1499,14 @@ define void @shlor(ptr nocapture %x, ptr noalias nocapture readonly %y, i32 %n) ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: blt .LBB18_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adr.w lr, .LCPI18_0 +; CHECK-NEXT: adr r3, .LCPI18_0 ; CHECK-NEXT: adr r4, .LCPI18_1 ; CHECK-NEXT: adr r5, .LCPI18_2 ; CHECK-NEXT: adr r6, .LCPI18_3 ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vldrw.u32 q1, [r5] ; CHECK-NEXT: vldrw.u32 q2, [r4] -; CHECK-NEXT: vldrw.u32 q3, [lr] +; CHECK-NEXT: vldrw.u32 q3, [r3] ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: vadd.i32 q1, q1, r1 ; CHECK-NEXT: vadd.i32 q2, q2, r1 diff --git a/llvm/test/CodeGen/Thumb2/mve-phireg.ll b/llvm/test/CodeGen/Thumb2/mve-phireg.ll index dad856c..00a998c 100644 --- a/llvm/test/CodeGen/Thumb2/mve-phireg.ll +++ b/llvm/test/CodeGen/Thumb2/mve-phireg.ll @@ -38,7 +38,7 @@ define arm_aapcs_vfpcc void @k() { ; CHECK-NEXT: vmov.i32 q5, #0x0 ; CHECK-NEXT: vpsel q6, q4, q3 ; CHECK-NEXT: vstrh.16 q6, [r0] -; CHECK-NEXT: vmov q6, q5 +; CHECK-NEXT: vmov.i32 q6, #0x0 ; CHECK-NEXT: cbz r1, .LBB0_2 ; CHECK-NEXT: le .LBB0_1 ; CHECK-NEXT: .LBB0_2: @ %for.cond4.preheader @@ -135,12 +135,12 @@ vector.body115: ; preds = %vector.body115, %ve define dso_local i32 @e() #0 { ; CHECK-LABEL: e: ; CHECK: @ %bb.0: @ %entry -; CHECK-NEXT: .save {r4, r5, r6, r7, r8, r9, r10, lr} -; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, r9, r10, lr} +; CHECK-NEXT: .save {r4, r5, r6, r7, r8, lr} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r8, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #408 -; CHECK-NEXT: sub sp, #408 +; CHECK-NEXT: .pad #392 +; CHECK-NEXT: sub sp, #392 ; CHECK-NEXT: movw r7, :lower16:.L_MergedGlobals ; CHECK-NEXT: vldr s15, .LCPI1_1 ; CHECK-NEXT: movt r7, :upper16:.L_MergedGlobals @@ -148,18 +148,16 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: mov r4, r7 ; CHECK-NEXT: mov r3, r7 ; CHECK-NEXT: ldr r6, [r4, #8]! -; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: ldr r0, [r3, #4]! -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill ; CHECK-NEXT: movt r2, :upper16:e +; CHECK-NEXT: ldr r0, [r3, #4]! ; CHECK-NEXT: vmov r5, s15 ; CHECK-NEXT: vmov q0[2], q0[0], r4, r4 -; CHECK-NEXT: vmov s13, r3 ; CHECK-NEXT: vldr s12, .LCPI1_0 +; CHECK-NEXT: vmov s13, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r2 ; CHECK-NEXT: vdup.32 q7, r3 ; CHECK-NEXT: vmov q6[2], q6[0], r3, r5 -; CHECK-NEXT: vstrw.32 q0, [sp, #92] +; CHECK-NEXT: vstrw.32 q0, [sp, #76] ; CHECK-NEXT: vmov q0, q7 ; CHECK-NEXT: vmov q6[3], q6[1], r3, r2 ; CHECK-NEXT: vmov q4, q7 @@ -168,7 +166,7 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov s21, r2 ; CHECK-NEXT: movs r1, #64 ; CHECK-NEXT: vmov.f32 s20, s12 -; CHECK-NEXT: str r0, [sp, #40] +; CHECK-NEXT: str r0, [sp, #24] ; CHECK-NEXT: vmov.f32 s22, s13 ; CHECK-NEXT: str r6, [r0] ; CHECK-NEXT: vmov.f32 s23, s15 @@ -186,12 +184,12 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: vmov q2[3], q2[1], r4, r5 ; CHECK-NEXT: vmov.32 q4[0], r8 ; CHECK-NEXT: @ implicit-def: $r2 -; CHECK-NEXT: str.w r8, [sp, #44] -; CHECK-NEXT: vstrw.32 q3, [sp, #60] -; CHECK-NEXT: strh.w r12, [sp, #406] +; CHECK-NEXT: str.w r8, [sp, #28] +; CHECK-NEXT: vstrw.32 q3, [sp, #44] +; CHECK-NEXT: strh.w r12, [sp, #390] ; CHECK-NEXT: wlstp.8 lr, r1, .LBB1_2 ; CHECK-NEXT: .LBB1_1: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vldrw.u32 q0, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: vstrb.8 q0, [r2], #16 ; CHECK-NEXT: letp lr, .LBB1_1 ; CHECK-NEXT: .LBB1_2: @ %entry @@ -199,7 +197,7 @@ define dso_local i32 @e() #0 { ; CHECK-NEXT: str.w r8, [r7] ; CHECK-NEXT: vstrw.32 q4, [r0] ; CHECK-NEXT: vstrw.32 q2, [r0] -; CHECK-NEXT: str.w r12, [sp, #324] +; CHECK-NEXT: str.w r12, [sp, #308] ; CHECK-NEXT: .LBB1_3: @ %for.cond ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-NEXT: b .LBB1_3 diff --git a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll index f90af3c..2587a0bb 100644 --- a/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll +++ b/llvm/test/CodeGen/Thumb2/mve-postinc-dct.ll @@ -115,17 +115,17 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: movs r4, #1 ; CHECK-NEXT: ldr r3, [r0] ; CHECK-NEXT: add.w r11, r3, r12, lsl #2 -; CHECK-NEXT: add.w r7, r3, r12, lsl #3 -; CHECK-NEXT: lsl.w r9, r12, #3 +; CHECK-NEXT: add.w r6, r3, r12, lsl #3 +; CHECK-NEXT: lsl.w r10, r12, #3 ; CHECK-NEXT: .LBB1_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB1_3 Depth 2 ; CHECK-NEXT: ldr r5, [sp] @ 4-byte Reload +; CHECK-NEXT: add.w r9, r4, #1 ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: add.w r10, r4, #1 ; CHECK-NEXT: mov r3, r11 -; CHECK-NEXT: mov r0, r7 -; CHECK-NEXT: vmov q1, q0 +; CHECK-NEXT: mov r0, r6 +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB1_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB1_2 Depth=1 @@ -139,11 +139,11 @@ define void @DCT_mve2(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB1_2 Depth=1 ; CHECK-NEXT: vadd.f32 s2, s2, s3 -; CHECK-NEXT: add.w r0, r2, r10, lsl #2 +; CHECK-NEXT: add.w r0, r2, r9, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s1 -; CHECK-NEXT: add r11, r9 +; CHECK-NEXT: add r11, r10 ; CHECK-NEXT: vadd.f32 s6, s6, s7 -; CHECK-NEXT: add r7, r9 +; CHECK-NEXT: add r6, r10 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vadd.f32 s2, s4, s6 @@ -228,46 +228,40 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9} ; CHECK-NEXT: vpush {d8, d9} -; CHECK-NEXT: .pad #24 -; CHECK-NEXT: sub sp, #24 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: str r2, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: subs r1, #3 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo .LBB2_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r9, [r0, #8] ; CHECK-NEXT: movs r5, #1 ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r0, r3, r3, lsl #1 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 -; CHECK-NEXT: add.w r12, r1, r3, lsl #3 -; CHECK-NEXT: adds r3, #3 +; CHECK-NEXT: add.w r3, r9, #3 ; CHECK-NEXT: bic r3, r3, #3 -; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload -; CHECK-NEXT: add.w r10, r1, r0, lsl #2 +; CHECK-NEXT: add.w r0, r9, r9, lsl #1 ; CHECK-NEXT: subs r3, #4 +; CHECK-NEXT: add.w r10, r1, r9, lsl #2 +; CHECK-NEXT: add.w r12, r1, r9, lsl #3 +; CHECK-NEXT: add.w r1, r1, r0, lsl #2 +; CHECK-NEXT: add.w r3, r5, r3, lsr #2 +; CHECK-NEXT: str r3, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload ; CHECK-NEXT: lsl.w r11, r0, #2 -; CHECK-NEXT: add.w r1, r5, r3, lsr #2 -; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB2_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB2_3 Depth 2 -; CHECK-NEXT: ldr r6, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r6, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: adds r0, r5, #2 -; CHECK-NEXT: adds r2, r5, #1 -; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload -; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: str r0, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: mov r3, r10 ; CHECK-NEXT: mov r0, r12 -; CHECK-NEXT: mov r4, r10 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: dlstp.32 lr, r7 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: dlstp.32 lr, r9 ; CHECK-NEXT: .LBB2_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB2_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 @@ -282,31 +276,31 @@ define void @DCT_mve3(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB2_2 Depth=1 ; CHECK-NEXT: vadd.f32 s10, s10, s11 -; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: adds r0, r5, #1 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: add r9, r11 +; CHECK-NEXT: add r10, r11 ; CHECK-NEXT: vadd.f32 s6, s6, s7 -; CHECK-NEXT: add.w r0, r1, r2, lsl #2 +; CHECK-NEXT: add.w r0, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: add r12, r11 ; CHECK-NEXT: vadd.f32 s2, s2, s3 -; CHECK-NEXT: add r10, r11 +; CHECK-NEXT: add r1, r11 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s8, s8, s10 ; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vstr s8, [r0] -; CHECK-NEXT: add.w r0, r1, r5, lsl #2 -; CHECK-NEXT: adds r5, #3 +; CHECK-NEXT: add.w r0, r2, r5, lsl #2 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: add.w r0, r1, r0, lsl #2 +; CHECK-NEXT: adds r0, r5, #2 +; CHECK-NEXT: adds r5, #3 +; CHECK-NEXT: add.w r0, r2, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r5, r0 ; CHECK-NEXT: blo .LBB2_2 ; CHECK-NEXT: .LBB2_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #24 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -394,15 +388,15 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11} ; CHECK-NEXT: vpush {d8, d9, d10, d11} -; CHECK-NEXT: .pad #40 -; CHECK-NEXT: sub sp, #40 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: .pad #24 +; CHECK-NEXT: sub sp, #24 +; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] -; CHECK-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r2, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: subs r1, #4 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 -; CHECK-NEXT: blo.w .LBB3_5 +; CHECK-NEXT: blo .LBB3_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader ; CHECK-NEXT: ldr r2, [r0, #8] ; CHECK-NEXT: movs r6, #1 @@ -410,34 +404,28 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: add.w r0, r2, r2, lsl #1 ; CHECK-NEXT: add.w r12, r1, r2, lsl #2 ; CHECK-NEXT: add.w r8, r1, r2, lsl #3 -; CHECK-NEXT: add.w r9, r1, r2, lsl #4 -; CHECK-NEXT: add.w r11, r1, r0, lsl #2 +; CHECK-NEXT: add.w r10, r1, r2, lsl #4 +; CHECK-NEXT: add.w r9, r1, r0, lsl #2 ; CHECK-NEXT: adds r0, r2, #3 ; CHECK-NEXT: bic r0, r0, #3 ; CHECK-NEXT: subs r0, #4 ; CHECK-NEXT: add.w r0, r6, r0, lsr #2 -; CHECK-NEXT: strd r0, r2, [sp, #8] @ 8-byte Folded Spill +; CHECK-NEXT: strd r0, r2, [sp, #4] @ 8-byte Folded Spill ; CHECK-NEXT: lsls r0, r2, #4 -; CHECK-NEXT: ldrd r2, r7, [sp, #8] @ 8-byte Folded Reload -; CHECK-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: ldrd r2, r7, [sp, #4] @ 8-byte Folded Reload +; CHECK-NEXT: str r0, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB3_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB3_3 Depth 2 -; CHECK-NEXT: adds r0, r6, #3 -; CHECK-NEXT: str r0, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: adds r0, r6, #2 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload ; CHECK-NEXT: vmov.i32 q0, #0x0 -; CHECK-NEXT: str r0, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: adds r0, r6, #1 -; CHECK-NEXT: str r0, [sp, #28] @ 4-byte Spill ; CHECK-NEXT: mov r3, r12 ; CHECK-NEXT: mov r0, r8 -; CHECK-NEXT: mov r5, r11 -; CHECK-NEXT: mov r4, r9 -; CHECK-NEXT: vmov q1, q0 -; CHECK-NEXT: vmov q2, q0 -; CHECK-NEXT: vmov q3, q0 +; CHECK-NEXT: mov r5, r9 +; CHECK-NEXT: mov r4, r10 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: dlstp.32 lr, r7 ; CHECK-NEXT: .LBB3_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB3_2 Depth=1 @@ -455,9 +443,9 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: @ %bb.4: @ %middle.block ; CHECK-NEXT: @ in Loop: Header=BB3_2 Depth=1 ; CHECK-NEXT: vadd.f32 s14, s14, s15 -; CHECK-NEXT: ldr r0, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: adds r0, r6, #1 ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 @@ -471,24 +459,24 @@ define void @DCT_mve4(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vadd.f32 s0, s0, s2 ; CHECK-NEXT: vstr s12, [r0] ; CHECK-NEXT: add.w r0, r1, r6, lsl #2 -; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: vstr s8, [r0] -; CHECK-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: adds r0, r6, #2 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s4, [r0] -; CHECK-NEXT: ldr r0, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: adds r0, r6, #3 +; CHECK-NEXT: adds r6, #4 ; CHECK-NEXT: add.w r0, r1, r0, lsl #2 ; CHECK-NEXT: vstr s0, [r0] -; CHECK-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-NEXT: add r12, r0 ; CHECK-NEXT: add r8, r0 -; CHECK-NEXT: add r11, r0 ; CHECK-NEXT: add r9, r0 -; CHECK-NEXT: ldr r0, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: add r10, r0 +; CHECK-NEXT: ldr r0, [sp, #16] @ 4-byte Reload ; CHECK-NEXT: cmp r6, r0 ; CHECK-NEXT: blo .LBB3_2 ; CHECK-NEXT: .LBB3_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #40 +; CHECK-NEXT: add sp, #24 ; CHECK-NEXT: vpop {d8, d9, d10, d11} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -588,60 +576,53 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #5 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB4_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r12, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r12, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: add.w r8, r1, r12, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r3, #2 +; CHECK-NEXT: lsl.w r5, r12, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r3, r3, lsl #2 -; CHECK-NEXT: lsls r1, r1, #2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: add.w r1, r12, r12, lsl #2 +; CHECK-NEXT: lsls r1, r1, #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB4_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB4_3 Depth 2 -; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: ldr r4, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: add.w r10, r0, #2 +; CHECK-NEXT: ldr r4, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: add.w r11, r0, #1 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: dlstp.32 lr, r7 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB4_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB4_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: add.w r9, r3, r5 ; CHECK-NEXT: vldrw.u32 q5, [r4], #16 ; CHECK-NEXT: vldrw.u32 q6, [r3], #16 -; CHECK-NEXT: add.w r12, r9, r5 +; CHECK-NEXT: add.w r10, r9, r5 ; CHECK-NEXT: vfma.f32 q3, q6, q5 ; CHECK-NEXT: vldrw.u32 q6, [r9] -; CHECK-NEXT: add.w r6, r12, r5 +; CHECK-NEXT: add.w r6, r10, r5 ; CHECK-NEXT: vfma.f32 q4, q6, q5 -; CHECK-NEXT: vldrw.u32 q6, [r12] +; CHECK-NEXT: vldrw.u32 q6, [r10] ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vfma.f32 q2, q6, q5 ; CHECK-NEXT: vldrw.u32 q6, [r6] @@ -662,30 +643,31 @@ define void @DCT_mve5(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: vadd.f32 s0, s0, s1 ; CHECK-NEXT: vadd.f32 s1, s16, s18 -; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: vadd.f32 s12, s12, s14 +; CHECK-NEXT: vadd.f32 s2, s2, s3 ; CHECK-NEXT: vadd.f32 s4, s4, s6 ; CHECK-NEXT: vadd.f32 s6, s8, s10 ; CHECK-NEXT: vstr s1, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 -; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: adds r0, #5 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: add.w r1, r2, r10, lsl #2 +; CHECK-NEXT: adds r1, r0, #2 +; CHECK-NEXT: vadd.f32 s0, s0, s2 +; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #4 +; CHECK-NEXT: adds r0, #5 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: add r8, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 -; CHECK-NEXT: blo.w .LBB4_2 +; CHECK-NEXT: blo .LBB4_2 ; CHECK-NEXT: .LBB4_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -797,63 +779,54 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: .pad #16 +; CHECK-NEXT: sub sp, #16 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #6 -; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB5_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r12, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #8] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r12, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r8, r1, r3, lsl #2 +; CHECK-NEXT: add.w r8, r1, r12, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r3, #2 +; CHECK-NEXT: lsl.w r5, r12, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill -; CHECK-NEXT: add.w r1, r3, r3, lsl #1 +; CHECK-NEXT: add.w r1, r12, r12, lsl #1 ; CHECK-NEXT: lsls r1, r1, #3 ; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB5_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB5_3 Depth 2 -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: ldr r7, [sp, #8] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill -; CHECK-NEXT: vmov.i32 q1, #0x0 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload -; CHECK-NEXT: add.w r11, r0, #2 +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #1 -; CHECK-NEXT: ldr r6, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q1, #0x0 ; CHECK-NEXT: mov r3, r8 -; CHECK-NEXT: vmov q3, q1 -; CHECK-NEXT: vmov q4, q1 -; CHECK-NEXT: vmov q0, q1 -; CHECK-NEXT: vmov q5, q1 -; CHECK-NEXT: vmov q2, q1 -; CHECK-NEXT: dlstp.32 lr, r7 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: dlstp.32 lr, r12 ; CHECK-NEXT: .LBB5_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB5_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 -; CHECK-NEXT: add.w r12, r3, r5 +; CHECK-NEXT: add.w r10, r3, r5 ; CHECK-NEXT: vldrw.u32 q6, [r1], #16 ; CHECK-NEXT: vldrw.u32 q7, [r3], #16 -; CHECK-NEXT: add.w r10, r12, r5 +; CHECK-NEXT: add.w r11, r10, r5 ; CHECK-NEXT: vfma.f32 q4, q7, q6 -; CHECK-NEXT: vldrw.u32 q7, [r12] -; CHECK-NEXT: add.w r6, r10, r5 -; CHECK-NEXT: vfma.f32 q5, q7, q6 ; CHECK-NEXT: vldrw.u32 q7, [r10] +; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: vfma.f32 q5, q7, q6 +; CHECK-NEXT: vldrw.u32 q7, [r11] ; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vfma.f32 q2, q7, q6 ; CHECK-NEXT: vldrw.u32 q7, [r6] @@ -885,28 +858,29 @@ define void @DCT_mve6(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vstr s1, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s0, s0, s2 -; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: vstr s3, [r1] -; CHECK-NEXT: add.w r1, r2, r11, lsl #2 +; CHECK-NEXT: adds r1, r0, #2 ; CHECK-NEXT: vadd.f32 s4, s4, s6 +; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: vadd.f32 s6, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #5 +; CHECK-NEXT: adds r0, #6 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] ; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: add r8, r1 -; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB5_2 ; CHECK-NEXT: .LBB5_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #32 +; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1030,73 +1004,64 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #72 -; CHECK-NEXT: sub sp, #72 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .pad #48 +; CHECK-NEXT: sub sp, #48 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #7 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB6_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r10, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r10, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r9, r1, r3, lsl #2 +; CHECK-NEXT: add.w r9, r1, r10, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r5, r3, #2 +; CHECK-NEXT: lsl.w r5, r10, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: rsb r1, r3, r3, lsl #3 -; CHECK-NEXT: lsls r1, r1, #2 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: rsb r1, r10, r10, lsl #3 +; CHECK-NEXT: lsls r1, r1, #2 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB6_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB6_3 Depth 2 -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #3 -; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill -; CHECK-NEXT: vmov.i32 q2, #0x0 -; CHECK-NEXT: ldr r1, [sp, #20] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #12] @ 4-byte Reload ; CHECK-NEXT: adds r4, r0, #2 ; CHECK-NEXT: add.w r8, r0, #1 -; CHECK-NEXT: ldr r6, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 ; CHECK-NEXT: mov r3, r9 -; CHECK-NEXT: vmov q4, q2 -; CHECK-NEXT: vmov q5, q2 -; CHECK-NEXT: vmov q3, q2 -; CHECK-NEXT: vmov q6, q2 -; CHECK-NEXT: vmov q1, q2 -; CHECK-NEXT: mov r12, r7 -; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: dls lr, r6 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 +; CHECK-NEXT: vmov.i32 q6, #0x0 +; CHECK-NEXT: vmov.i32 q1, #0x0 +; CHECK-NEXT: mov r12, r10 +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB6_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB6_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vctp.32 r12 -; CHECK-NEXT: add.w r10, r3, r5 +; CHECK-NEXT: add.w r11, r3, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q7, [r1], #16 ; CHECK-NEXT: vldrwt.u32 q0, [r3], #16 -; CHECK-NEXT: add.w r11, r10, r5 +; CHECK-NEXT: add.w r6, r11, r5 ; CHECK-NEXT: sub.w r12, r12, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q5, q0, q7 -; CHECK-NEXT: vldrwt.u32 q0, [r10] -; CHECK-NEXT: add.w r6, r11, r5 +; CHECK-NEXT: vldrwt.u32 q0, [r11] +; CHECK-NEXT: adds r7, r6, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q6, q0, q7 -; CHECK-NEXT: vldrwt.u32 q0, [r11] -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q1, q0, q7 @@ -1104,26 +1069,26 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vmov q3, q1 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q0, [r6] -; CHECK-NEXT: vldrw.u32 q1, [sp, #56] @ 16-byte Reload -; CHECK-NEXT: adds r7, r6, r5 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q1, q0, q7 ; CHECK-NEXT: vldrwt.u32 q0, [r7] +; CHECK-NEXT: vldrw.u32 q1, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: adds r6, r7, r5 -; CHECK-NEXT: vstrw.32 q1, [sp, #56] @ 16-byte Spill +; CHECK-NEXT: vpstt +; CHECK-NEXT: vfmat.f32 q1, q0, q7 +; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: vstrw.32 q1, [sp, #32] @ 16-byte Spill ; CHECK-NEXT: vmov q1, q3 ; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q3, q0, q7 -; CHECK-NEXT: vldrwt.u32 q0, [r6] +; CHECK-NEXT: vldrwt.u32 q0, [r7] ; CHECK-NEXT: vmov q4, q5 -; CHECK-NEXT: adds r7, r6, r5 +; CHECK-NEXT: adds r6, r7, r5 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q4, q0, q7 -; CHECK-NEXT: vldrwt.u32 q0, [r7] +; CHECK-NEXT: vldrwt.u32 q0, [r6] ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q2, q0, q7 ; CHECK-NEXT: le lr, .LBB6_3 @@ -1138,45 +1103,45 @@ define void @DCT_mve7(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vadd.f32 s4, s4, s5 ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 -; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s9, s18, s19 ; CHECK-NEXT: vadd.f32 s11, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload -; CHECK-NEXT: vadd.f32 s2, s3, s1 +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload +; CHECK-NEXT: vadd.f32 s0, s2, s0 ; CHECK-NEXT: vadd.f32 s5, s18, s19 ; CHECK-NEXT: vadd.f32 s7, s16, s17 +; CHECK-NEXT: vadd.f32 s2, s3, s1 ; CHECK-NEXT: vadd.f32 s4, s4, s6 -; CHECK-NEXT: vstr s0, [r1] -; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s14, s14, s15 -; CHECK-NEXT: adds r0, #7 ; CHECK-NEXT: vadd.f32 s12, s12, s13 -; CHECK-NEXT: vstr s2, [r1] -; CHECK-NEXT: add.w r1, r2, r4, lsl #2 +; CHECK-NEXT: vstr s0, [r1] +; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s8, s8, s10 ; CHECK-NEXT: vadd.f32 s6, s7, s5 -; CHECK-NEXT: vstr s4, [r1] +; CHECK-NEXT: vstr s2, [r1] +; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vadd.f32 s10, s11, s9 -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload +; CHECK-NEXT: vstr s4, [r1] +; CHECK-NEXT: adds r1, r0, #3 ; CHECK-NEXT: vadd.f32 s12, s12, s14 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s10, [r1] -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #6 +; CHECK-NEXT: adds r0, #7 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: add r9, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB6_2 ; CHECK-NEXT: .LBB6_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #72 +; CHECK-NEXT: add sp, #48 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} @@ -1312,107 +1277,99 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: sub sp, #4 ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #88 -; CHECK-NEXT: sub sp, #88 -; CHECK-NEXT: str r1, [sp, #20] @ 4-byte Spill +; CHECK-NEXT: .pad #64 +; CHECK-NEXT: sub sp, #64 +; CHECK-NEXT: str r1, [sp, #12] @ 4-byte Spill ; CHECK-NEXT: ldr r1, [r0, #4] ; CHECK-NEXT: subs r1, #8 -; CHECK-NEXT: str r1, [sp, #16] @ 4-byte Spill +; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill ; CHECK-NEXT: cmp r1, #2 ; CHECK-NEXT: blo.w .LBB7_5 ; CHECK-NEXT: @ %bb.1: @ %for.body.preheader -; CHECK-NEXT: ldr r3, [r0, #8] +; CHECK-NEXT: ldr.w r11, [r0, #8] ; CHECK-NEXT: ldr r1, [r0] -; CHECK-NEXT: adds r0, r3, #3 -; CHECK-NEXT: str r3, [sp, #12] @ 4-byte Spill +; CHECK-NEXT: add.w r0, r11, #3 ; CHECK-NEXT: bic r0, r0, #3 -; CHECK-NEXT: add.w r12, r1, r3, lsl #2 +; CHECK-NEXT: add.w r12, r1, r11, lsl #2 ; CHECK-NEXT: subs r1, r0, #4 ; CHECK-NEXT: movs r0, #1 -; CHECK-NEXT: lsls r6, r3, #2 +; CHECK-NEXT: lsl.w r6, r11, #2 ; CHECK-NEXT: add.w r1, r0, r1, lsr #2 -; CHECK-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-NEXT: lsls r1, r3, #5 ; CHECK-NEXT: str r1, [sp, #4] @ 4-byte Spill +; CHECK-NEXT: lsl.w r1, r11, #5 +; CHECK-NEXT: str r1, [sp] @ 4-byte Spill ; CHECK-NEXT: .LBB7_2: @ %for.body ; CHECK-NEXT: @ =>This Loop Header: Depth=1 ; CHECK-NEXT: @ Child Loop BB7_3 Depth 2 -; CHECK-NEXT: adds r1, r0, #7 -; CHECK-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #6 -; CHECK-NEXT: str r1, [sp, #32] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #5 -; CHECK-NEXT: ldr r7, [sp, #12] @ 4-byte Reload -; CHECK-NEXT: str r1, [sp, #28] @ 4-byte Spill -; CHECK-NEXT: adds r1, r0, #4 -; CHECK-NEXT: ldr.w r9, [sp, #20] @ 4-byte Reload -; CHECK-NEXT: vmov.i32 q3, #0x0 -; CHECK-NEXT: str r1, [sp, #24] @ 4-byte Spill +; CHECK-NEXT: ldr.w r9, [sp, #12] @ 4-byte Reload +; CHECK-NEXT: vmov.i32 q0, #0x0 ; CHECK-NEXT: adds r4, r0, #3 ; CHECK-NEXT: add.w r8, r0, #2 ; CHECK-NEXT: adds r1, r0, #1 -; CHECK-NEXT: ldr r5, [sp, #8] @ 4-byte Reload +; CHECK-NEXT: ldr r7, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: vstrw.32 q0, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vmov.i32 q0, #0x0 +; CHECK-NEXT: vmov.i32 q3, #0x0 ; CHECK-NEXT: mov r3, r12 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov q6, q3 -; CHECK-NEXT: vmov q4, q3 -; CHECK-NEXT: vmov q7, q3 -; CHECK-NEXT: vmov q2, q3 -; CHECK-NEXT: mov r10, r7 -; CHECK-NEXT: vstrw.32 q3, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: vstrw.32 q3, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: dls lr, r5 +; CHECK-NEXT: vmov.i32 q5, #0x0 +; CHECK-NEXT: vmov.i32 q6, #0x0 +; CHECK-NEXT: vmov.i32 q4, #0x0 +; CHECK-NEXT: vmov.i32 q7, #0x0 +; CHECK-NEXT: vmov.i32 q2, #0x0 +; CHECK-NEXT: mov r10, r11 +; CHECK-NEXT: vstrw.32 q0, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: dls lr, r7 ; CHECK-NEXT: .LBB7_3: @ %vector.body ; CHECK-NEXT: @ Parent Loop BB7_2 Depth=1 ; CHECK-NEXT: @ => This Inner Loop Header: Depth=2 ; CHECK-NEXT: vctp.32 r10 -; CHECK-NEXT: add.w r11, r3, r6 +; CHECK-NEXT: adds r5, r3, r6 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vldrwt.u32 q0, [r9], #16 ; CHECK-NEXT: vldrwt.u32 q1, [r3], #16 -; CHECK-NEXT: add.w r5, r11, r6 +; CHECK-NEXT: adds r7, r5, r6 ; CHECK-NEXT: sub.w r10, r10, #4 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q6, q1, q0 -; CHECK-NEXT: vldrwt.u32 q1, [r11] -; CHECK-NEXT: vstrw.32 q6, [sp, #40] @ 16-byte Spill +; CHECK-NEXT: vldrwt.u32 q1, [r5] +; CHECK-NEXT: vstrw.32 q6, [sp, #16] @ 16-byte Spill ; CHECK-NEXT: vmov q6, q5 ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q7, q1, q0 -; CHECK-NEXT: vmov q5, q3 -; CHECK-NEXT: vmov q3, q4 -; CHECK-NEXT: vmov q4, q2 +; CHECK-NEXT: vmov q5, q4 +; CHECK-NEXT: vmov q4, q3 +; CHECK-NEXT: vmov q3, q2 ; CHECK-NEXT: vpst -; CHECK-NEXT: vldrwt.u32 q1, [r5] -; CHECK-NEXT: vldrw.u32 q2, [sp, #56] @ 16-byte Reload -; CHECK-NEXT: adds r7, r5, r6 -; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r7] -; CHECK-NEXT: vstrw.32 q2, [sp, #56] @ 16-byte Spill -; CHECK-NEXT: vldrw.u32 q2, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q2, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: adds r5, r7, r6 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r5] +; CHECK-NEXT: vstrw.32 q2, [sp, #32] @ 16-byte Spill +; CHECK-NEXT: vldrw.u32 q2, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: adds r7, r5, r6 -; CHECK-NEXT: vstrw.32 q2, [sp, #72] @ 16-byte Spill -; CHECK-NEXT: vmov q2, q4 -; CHECK-NEXT: vmov q4, q3 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: adds r5, r7, r6 -; CHECK-NEXT: vmov q3, q5 +; CHECK-NEXT: vstrw.32 q2, [sp, #48] @ 16-byte Spill +; CHECK-NEXT: vmov q2, q3 +; CHECK-NEXT: vmov q3, q4 ; CHECK-NEXT: vpstt -; CHECK-NEXT: vfmat.f32 q4, q1, q0 +; CHECK-NEXT: vfmat.f32 q2, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r5] +; CHECK-NEXT: vmov q4, q5 +; CHECK-NEXT: adds r7, r5, r6 +; CHECK-NEXT: vpstt +; CHECK-NEXT: vfmat.f32 q4, q1, q0 +; CHECK-NEXT: vldrwt.u32 q1, [r7] ; CHECK-NEXT: vmov q5, q6 -; CHECK-NEXT: add r5, r6 +; CHECK-NEXT: adds r5, r7, r6 ; CHECK-NEXT: vpstt ; CHECK-NEXT: vfmat.f32 q5, q1, q0 ; CHECK-NEXT: vldrwt.u32 q1, [r5] -; CHECK-NEXT: vldrw.u32 q6, [sp, #40] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q6, [sp, #16] @ 16-byte Reload ; CHECK-NEXT: vpst ; CHECK-NEXT: vfmat.f32 q3, q1, q0 ; CHECK-NEXT: le lr, .LBB7_3 @@ -1425,12 +1382,12 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vadd.f32 s6, s24, s25 ; CHECK-NEXT: vadd.f32 s5, s18, s19 ; CHECK-NEXT: vadd.f32 s7, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #56] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #32] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s10, s10, s11 ; CHECK-NEXT: vadd.f32 s8, s8, s9 ; CHECK-NEXT: vadd.f32 s9, s18, s19 ; CHECK-NEXT: vadd.f32 s11, s16, s17 -; CHECK-NEXT: vldrw.u32 q4, [sp, #72] @ 16-byte Reload +; CHECK-NEXT: vldrw.u32 q4, [sp, #48] @ 16-byte Reload ; CHECK-NEXT: vadd.f32 s14, s14, s15 ; CHECK-NEXT: vadd.f32 s12, s12, s13 ; CHECK-NEXT: vadd.f32 s13, s18, s19 @@ -1445,33 +1402,33 @@ define void @DCT_mve8(ptr nocapture readonly %S, ptr nocapture readonly %pIn, pt ; CHECK-NEXT: vstr s0, [r1] ; CHECK-NEXT: add.w r1, r2, r0, lsl #2 ; CHECK-NEXT: vadd.f32 s3, s20, s21 -; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: vstr s2, [r1] ; CHECK-NEXT: add.w r1, r2, r8, lsl #2 ; CHECK-NEXT: vadd.f32 s12, s7, s5 ; CHECK-NEXT: vstr s10, [r1] ; CHECK-NEXT: add.w r1, r2, r4, lsl #2 ; CHECK-NEXT: vstr s14, [r1] -; CHECK-NEXT: ldr r1, [sp, #24] @ 4-byte Reload -; CHECK-NEXT: vadd.f32 s4, s3, s1 +; CHECK-NEXT: adds r1, r0, #4 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 +; CHECK-NEXT: vadd.f32 s4, s3, s1 ; CHECK-NEXT: vstr s8, [r1] -; CHECK-NEXT: ldr r1, [sp, #28] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #5 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s12, [r1] -; CHECK-NEXT: ldr r1, [sp, #32] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #6 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s4, [r1] -; CHECK-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-NEXT: adds r1, r0, #7 +; CHECK-NEXT: adds r0, #8 ; CHECK-NEXT: add.w r1, r2, r1, lsl #2 ; CHECK-NEXT: vstr s6, [r1] -; CHECK-NEXT: ldr r1, [sp, #4] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-NEXT: add r12, r1 -; CHECK-NEXT: ldr r1, [sp, #16] @ 4-byte Reload +; CHECK-NEXT: ldr r1, [sp, #8] @ 4-byte Reload ; CHECK-NEXT: cmp r0, r1 ; CHECK-NEXT: blo.w .LBB7_2 ; CHECK-NEXT: .LBB7_5: @ %for.cond.cleanup -; CHECK-NEXT: add sp, #88 +; CHECK-NEXT: add sp, #64 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: add sp, #4 ; CHECK-NEXT: pop.w {r4, r5, r6, r7, r8, r9, r10, r11, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll index 29c4fb9..413c4a1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll +++ b/llvm/test/CodeGen/Thumb2/mve-qrintrsplat.ll @@ -1496,15 +1496,14 @@ define void @vfmasq(ptr %x, ptr %y, i32 %n) { ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB34_1: @ %for.body.preheader -; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB34_2: @ %for.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r0], #16 -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vstrw.32 q3, [r1], #16 +; CHECK-NEXT: vmov.f32 q2, #1.000000e+01 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0], #16 +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r1], #16 ; CHECK-NEXT: letp lr, .LBB34_2 ; CHECK-NEXT: @ %bb.3: @ %for.cond.cleanup ; CHECK-NEXT: pop {r7, pc} @@ -1542,15 +1541,14 @@ define void @vfmas(ptr %s1, ptr %s2, i32 %N) { ; CHECK-NEXT: it lt ; CHECK-NEXT: poplt {r7, pc} ; CHECK-NEXT: .LBB35_1: @ %while.body.lr.ph -; CHECK-NEXT: vmov.f32 q0, #1.000000e+01 ; CHECK-NEXT: dlstp.32 lr, r2 ; CHECK-NEXT: .LBB35_2: @ %while.body ; CHECK-NEXT: @ =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vldrw.u32 q1, [r1] -; CHECK-NEXT: vldrw.u32 q2, [r0] -; CHECK-NEXT: vfma.f32 q3, q2, q1 -; CHECK-NEXT: vstrw.32 q3, [r0], #16 +; CHECK-NEXT: vmov.f32 q2, #1.000000e+01 +; CHECK-NEXT: vldrw.u32 q0, [r1] +; CHECK-NEXT: vldrw.u32 q1, [r0] +; CHECK-NEXT: vfma.f32 q2, q1, q0 +; CHECK-NEXT: vstrw.32 q2, [r0], #16 ; CHECK-NEXT: letp lr, .LBB35_2 ; CHECK-NEXT: @ %bb.3: @ %while.end ; CHECK-NEXT: pop {r7, pc} diff --git a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll index e845070..62482c1 100644 --- a/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll +++ b/llvm/test/CodeGen/Thumb2/mve-scatter-increment.ll @@ -287,17 +287,17 @@ define void @shlor(ptr nocapture readonly %x, ptr noalias nocapture %y, i32 %n) ; CHECK-NEXT: cmp r2, #1 ; CHECK-NEXT: blt .LBB5_3 ; CHECK-NEXT: @ %bb.1: @ %vector.ph -; CHECK-NEXT: adr.w lr, .LCPI5_0 -; CHECK-NEXT: adr r4, .LCPI5_1 +; CHECK-NEXT: adr r4, .LCPI5_0 +; CHECK-NEXT: adr r3, .LCPI5_1 ; CHECK-NEXT: adr r5, .LCPI5_2 ; CHECK-NEXT: adr r6, .LCPI5_3 -; CHECK-NEXT: vldrw.u32 q2, [r4] +; CHECK-NEXT: vldrw.u32 q2, [r3] +; CHECK-NEXT: vldrw.u32 q3, [r4] ; CHECK-NEXT: vldrw.u32 q0, [r6] ; CHECK-NEXT: vldrw.u32 q1, [r5] -; CHECK-NEXT: vldrw.u32 q3, [lr] +; CHECK-NEXT: vadd.i32 q2, q2, r1 ; CHECK-NEXT: vadd.i32 q0, q0, r1 ; CHECK-NEXT: vadd.i32 q1, q1, r1 -; CHECK-NEXT: vadd.i32 q2, q2, r1 ; CHECK-NEXT: vadd.i32 q3, q3, r1 ; CHECK-NEXT: mov.w r12, #1 ; CHECK-NEXT: movs r4, #3 diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll index f9948db..c92c2be 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-addpred.ll @@ -656,14 +656,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK: @ %bb.0: @ %entry ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r0, q5[0] ; CHECK-NEXT: vmov.16 q3[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] @@ -706,7 +704,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: orrs r1, r3 ; CHECK-NEXT: add r0, r2 ; CHECK-NEXT: vmov r2, r3, d15 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.u8 r2, q2[3] @@ -785,6 +783,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -853,7 +852,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %b) { ; CHECK-NEXT: vmov r2, r3, d1 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: bx lr entry: @@ -2065,14 +2063,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q2, q0 ; CHECK-NEXT: vcmp.i8 eq, q1, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q1, #0xff ; CHECK-NEXT: vpsel q5, q1, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r2, q5[0] ; CHECK-NEXT: vmov.16 q3[0], r2 ; CHECK-NEXT: vmov.u8 r2, q5[1] @@ -2115,7 +2111,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: orr.w lr, lr, r3 ; CHECK-NEXT: add r12, r2 ; CHECK-NEXT: vmov r3, r2, d15 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vmov.u8 r2, q2[3] @@ -2194,6 +2190,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr ; CHECK-NEXT: vpsel q6, q1, q7 +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -2264,7 +2261,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %b ; CHECK-NEXT: adc.w r3, r3, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: diff --git a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll index 63b1431..9f55183 100644 --- a/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll +++ b/llvm/test/CodeGen/Thumb2/mve-vecreduce-mlapred.ll @@ -817,16 +817,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vmov.u8 r0, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov.16 q0[0], r0 ; CHECK-NEXT: vmov.u8 r0, q6[1] ; CHECK-NEXT: vmov.16 q0[1], r0 @@ -842,9 +840,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q0[6], r0 ; CHECK-NEXT: vmov.u8 r0, q6[7] ; CHECK-NEXT: vmov.16 q0[7], r0 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vmov.u8 r2, q3[0] +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q7, q2, q4 ; CHECK-NEXT: vmov.u16 r0, q7[2] ; CHECK-NEXT: vmov.u16 r1, q7[0] @@ -895,7 +892,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -916,8 +913,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.u16 r3, q7[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q4, q0 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 ; CHECK-NEXT: vmov q4[3], q4[1], r2, r3 @@ -932,7 +929,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r0, s30 ; CHECK-NEXT: vmov r1, s28 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0xff ; CHECK-NEXT: vmov r3, s16 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -960,7 +957,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -1041,7 +1038,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r0, s18 ; CHECK-NEXT: vmov r1, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r3, s0 ; CHECK-NEXT: umull r0, r2, r0, r2 ; CHECK-NEXT: umull r1, r3, r1, r3 @@ -1062,7 +1059,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.u16 r3, q6[5] ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: vpsel q0, q0, q4 ; CHECK-NEXT: vmov r2, r3, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r2, r3 @@ -1117,7 +1114,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_zext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: @@ -1137,16 +1133,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: push {r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vmov.s8 r2, q1[0] +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r0, q5[0] -; CHECK-NEXT: vmov.s8 r3, q3[0] +; CHECK-NEXT: vmov.s8 r2, q1[0] ; CHECK-NEXT: vmov.16 q4[0], r0 ; CHECK-NEXT: vmov.u8 r0, q5[1] ; CHECK-NEXT: vmov.16 q4[1], r0 @@ -1162,9 +1156,9 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q4[6], r0 ; CHECK-NEXT: vmov.u8 r0, q5[7] ; CHECK-NEXT: vmov.16 q4[7], r0 -; CHECK-NEXT: smull r2, r3, r3, r2 +; CHECK-NEXT: vmov.s8 r3, q3[0] ; CHECK-NEXT: vcmp.i16 ne, q4, zr -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov.u16 r0, q6[2] ; CHECK-NEXT: vmov.u16 r1, q6[0] @@ -1198,7 +1192,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.s8 r3, q3[3] ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[2], q0[0], r0, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r1, r3 @@ -1219,7 +1213,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -1273,17 +1268,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: vmov.16 q6[7], r2 ; CHECK-NEXT: vmov.s8 r0, q1[8] ; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.i8 q6, #0x0 +; CHECK-NEXT: vpsel q5, q2, q6 ; CHECK-NEXT: vmov.s8 r1, q3[8] -; CHECK-NEXT: vpsel q5, q2, q7 -; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov.u16 r2, q5[2] ; CHECK-NEXT: vmov.u16 r3, q5[0] ; CHECK-NEXT: vmov q0[2], q0[0], r3, r2 ; CHECK-NEXT: vmov.u16 r2, q5[3] ; CHECK-NEXT: vmov.u16 r3, q5[1] +; CHECK-NEXT: smull r0, r1, r1, r0 ; CHECK-NEXT: vmov q0[3], q0[1], r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vpsel q6, q2, q6 ; CHECK-NEXT: vmov r2, r3, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 @@ -1365,7 +1361,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_sext(<16 x i8> %x, <16 x i8> %y, <1 ; CHECK-NEXT: adc.w r1, r1, lr ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r7, pc} entry: @@ -2296,16 +2291,14 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #32 -; CHECK-NEXT: sub sp, #32 ; CHECK-NEXT: vmov q3, q0 -; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q6, q2, q0 -; CHECK-NEXT: vmov q4, q0 +; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vmov.u8 r2, q6[0] -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov.16 q0[0], r2 ; CHECK-NEXT: vmov.u8 r2, q6[1] ; CHECK-NEXT: vmov.16 q0[1], r2 @@ -2321,9 +2314,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q0[6], r2 ; CHECK-NEXT: vmov.u8 r2, q6[7] ; CHECK-NEXT: vmov.16 q0[7], r2 -; CHECK-NEXT: vstrw.32 q2, [sp, #16] @ 16-byte Spill -; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vmov.u8 r4, q3[2] +; CHECK-NEXT: vcmp.i16 ne, q0, zr ; CHECK-NEXT: vpsel q7, q2, q4 ; CHECK-NEXT: vmov.u16 r2, q7[2] ; CHECK-NEXT: vmov.u16 r3, q7[0] @@ -2374,7 +2366,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r2, s2 ; CHECK-NEXT: vmov r3, s18 ; CHECK-NEXT: vmov r5, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0xff ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r3, r3, r2 ; CHECK-NEXT: umull r4, r5, r5, r4 @@ -2395,8 +2387,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u16 r4, q7[5] ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload -; CHECK-NEXT: vpsel q0, q0, q4 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q0, q4, q0 ; CHECK-NEXT: vmov r5, r4, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 ; CHECK-NEXT: vmov q4[3], q4[1], r5, r4 @@ -2411,7 +2403,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r5, s18 ; CHECK-NEXT: vmov r2, s30 ; CHECK-NEXT: vmov r3, s28 -; CHECK-NEXT: vldrw.u32 q7, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0xff ; CHECK-NEXT: vmov r4, s16 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2439,7 +2431,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2520,7 +2512,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov r5, s2 ; CHECK-NEXT: vmov r2, s18 ; CHECK-NEXT: vmov r3, s16 -; CHECK-NEXT: vldrw.u32 q4, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q4, #0x0 ; CHECK-NEXT: vmov r4, s0 ; CHECK-NEXT: umull r2, r5, r2, r5 ; CHECK-NEXT: umull r3, r4, r3, r4 @@ -2541,7 +2533,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.u16 r4, q6[5] ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vldrw.u32 q0, [sp, #16] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q0, #0xff ; CHECK-NEXT: vpsel q0, q0, q4 ; CHECK-NEXT: vmov r5, r4, d0 ; CHECK-NEXT: vmov q4[2], q4[0], r5, r4 @@ -2598,7 +2590,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_zext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #32 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: @@ -2619,14 +2610,12 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: push {r4, r5, r7, lr} ; CHECK-NEXT: .vsave {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: vpush {d8, d9, d10, d11, d12, d13, d14, d15} -; CHECK-NEXT: .pad #16 -; CHECK-NEXT: sub sp, #16 ; CHECK-NEXT: vmov q3, q0 ; CHECK-NEXT: vcmp.i8 eq, q2, zr ; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.i8 q2, #0xff ; CHECK-NEXT: vpsel q5, q2, q0 -; CHECK-NEXT: vstrw.32 q0, [sp] @ 16-byte Spill +; CHECK-NEXT: vmov.i8 q0, #0x0 ; CHECK-NEXT: vmov.u8 r2, q5[0] ; CHECK-NEXT: vmov.s8 r4, q1[2] ; CHECK-NEXT: vmov.16 q4[0], r2 @@ -2676,7 +2665,7 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adc.w r12, r12, r2 ; CHECK-NEXT: vmov r2, r3, d15 ; CHECK-NEXT: vmov q0[2], q0[0], r2, r3 -; CHECK-NEXT: vldrw.u32 q7, [sp] @ 16-byte Reload +; CHECK-NEXT: vmov.i8 q7, #0x0 ; CHECK-NEXT: vmov q0[3], q0[1], r2, r3 ; CHECK-NEXT: vmov.s8 r2, q1[3] ; CHECK-NEXT: vmov.s8 r3, q3[3] @@ -2701,7 +2690,8 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vmov.i8 q0, #0x0 +; CHECK-NEXT: vpsel q6, q2, q0 ; CHECK-NEXT: vmov r5, r4, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 @@ -2755,17 +2745,18 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: vmov.16 q6[7], r5 ; CHECK-NEXT: vmov.s8 r2, q1[8] ; CHECK-NEXT: vcmp.i16 ne, q6, zr +; CHECK-NEXT: vmov.i8 q6, #0x0 +; CHECK-NEXT: vpsel q5, q2, q6 ; CHECK-NEXT: vmov.s8 r3, q3[8] -; CHECK-NEXT: vpsel q5, q2, q7 -; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov.u16 r5, q5[2] ; CHECK-NEXT: vmov.u16 r4, q5[0] ; CHECK-NEXT: vmov q0[2], q0[0], r4, r5 ; CHECK-NEXT: vmov.u16 r5, q5[3] ; CHECK-NEXT: vmov.u16 r4, q5[1] +; CHECK-NEXT: smull r2, r3, r3, r2 ; CHECK-NEXT: vmov q0[3], q0[1], r4, r5 ; CHECK-NEXT: vcmp.i32 ne, q0, zr -; CHECK-NEXT: vpsel q6, q2, q7 +; CHECK-NEXT: vpsel q6, q2, q6 ; CHECK-NEXT: vmov r5, r4, d12 ; CHECK-NEXT: vmov q0[2], q0[0], r5, r4 ; CHECK-NEXT: vmov q0[3], q0[1], r5, r4 @@ -2849,7 +2840,6 @@ define arm_aapcs_vfpcc i64 @add_v16i8_v16i64_acc_sext(<16 x i8> %x, <16 x i8> %y ; CHECK-NEXT: adcs r3, r4 ; CHECK-NEXT: adds r0, r0, r2 ; CHECK-NEXT: adcs r1, r3 -; CHECK-NEXT: add sp, #16 ; CHECK-NEXT: vpop {d8, d9, d10, d11, d12, d13, d14, d15} ; CHECK-NEXT: pop {r4, r5, r7, pc} entry: diff --git a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll index 0c349c3..cba394f 100644 --- a/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll +++ b/llvm/test/CodeGen/X86/AMX/amx-greedy-ra-spill-shape.ll @@ -59,18 +59,18 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOVSX64rm32_:%[0-9]+]]:gr64_nosp = MOVSX64rm32 %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[MOV32rm2]].sub_32bit ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = MOVSX64rm32 %fixed-stack.1, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.1, align 16) - ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[MOVSX64rm32_1]] :: (store (s64) into %stack.5) ; CHECK-NEXT: [[MOVSX64rr32_4:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm1]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[MOV32rm3]] - ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[COPY6]] + ; CHECK-NEXT: [[MOVSX64rr32_5:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm3]] ; CHECK-NEXT: [[MOVSX64rr32_6:%[0-9]+]]:gr64 = MOVSX64rr32 [[MOV32rm]] ; CHECK-NEXT: MOV64mr %stack.8, 1, $noreg, 0, $noreg, [[MOVSX64rr32_6]] :: (store (s64) into %stack.8) + ; CHECK-NEXT: MOV64mr %stack.5, 1, $noreg, 0, $noreg, [[COPY1]] :: (store (s64) into %stack.5) ; CHECK-NEXT: MOV64mr %stack.6, 1, $noreg, 0, $noreg, [[MOVSX64rr32_4]] :: (store (s64) into %stack.6) - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY7]], [[MOVSX64rr32_2]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY7]], [[MOVSX64rm32_]], implicit-def dead $eflags - ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY7]], 0, $noreg + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = COPY [[MOVSX64rr32_4]] + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = IMUL64rr [[COPY6]], [[MOVSX64rr32_2]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr64_nosp = ADD64rr [[COPY6]], [[MOVSX64rm32_]], implicit-def dead $eflags + ; CHECK-NEXT: [[LEA64r:%[0-9]+]]:gr64 = LEA64r [[COPY]], 4, [[COPY6]], 0, $noreg ; CHECK-NEXT: MOV64mr %stack.9, 1, $noreg, 0, $noreg, [[LEA64r]] :: (store (s64) into %stack.9) + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[MOV32rm2]] ; CHECK-NEXT: MOV64mr %stack.7, 1, $noreg, 0, $noreg, [[MOVSX64rr32_5]] :: (store (s64) into %stack.7) ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_5]] ; CHECK-NEXT: [[COPY8:%[0-9]+]]:gr64 = IMUL64rr [[COPY8]], [[MOVSX64rr32_2]], implicit-def dead $eflags @@ -87,8 +87,11 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[MOV32rm4:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.2, align 8) ; CHECK-NEXT: CMP32rm [[MOV32rm4]], %fixed-stack.1, 1, $noreg, 0, $noreg, implicit-def $eflags :: (load (s32) from %fixed-stack.1, align 16) - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) + ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64_nosp = MOV64rm %stack.3, 1, $noreg, 0, $noreg :: (load (s64) from %stack.3) + ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) + ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm6]] ; CHECK-NEXT: JCC_1 %bb.5, 13, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.3 ; CHECK-NEXT: {{ $}} @@ -98,9 +101,8 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = MOV64rm %stack.6, 1, $noreg, 0, $noreg :: (load (s64) from %stack.6) ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = nsw IMUL64rr [[MOV64rm2]], [[MOVSX64rr32_]], implicit-def dead $eflags ; CHECK-NEXT: [[MOV64rm2:%[0-9]+]]:gr64 = ADD64rm [[MOV64rm2]], %stack.1, 1, $noreg, 0, $noreg, implicit-def dead $eflags :: (load (s64) from %stack.1) - ; CHECK-NEXT: MOV64mr %stack.13, 1, $noreg, 0, $noreg, [[MOV64rm2]] :: (store (s64) into %stack.13) - ; CHECK-NEXT: [[MOV32rm5:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) - ; CHECK-NEXT: undef [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm5]] + ; CHECK-NEXT: [[MOV32rm7:%[0-9]+]]:gr32 = MOV32rm %stack.12, 1, $noreg, 0, $noreg :: (load (s32) from %stack.12) + ; CHECK-NEXT: undef [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = COPY [[MOV32rm7]] ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = MOV64rm %stack.9, 1, $noreg, 0, $noreg :: (load (s64) from %stack.9) ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = MOV64rm %stack.4, 1, $noreg, 0, $noreg :: (load (s64) from %stack.4) ; CHECK-NEXT: JMP_1 %bb.6 @@ -123,40 +125,30 @@ define void @foo(i32 %M, i32 %N, i32 %K, ptr %A, ptr %B_rcr4, ptr %C, i32 %c_row ; CHECK-NEXT: bb.6.for.body17: ; CHECK-NEXT: successors: %bb.6(0x7c000000), %bb.5(0x04000000) ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit - ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.13, 1, $noreg, 0, $noreg :: (load (s64) from %stack.13) - ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY6]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm7]], 1, [[MOVSX64rr32_]], 0, $noreg - ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY9]].sub_32bit - ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr32 = COPY [[LEA64_32r1]] - ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOV64rm1]] - ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr32 = COPY [[COPY4]] - ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr32 = COPY [[COPY6]] - ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] - ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] - ; CHECK-NEXT: [[COPY16:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] - ; CHECK-NEXT: [[COPY17:%[0-9]+]]:gr64 = COPY [[MOV32rm2]] - ; CHECK-NEXT: [[COPY18:%[0-9]+]]:gr64 = COPY [[COPY1]] - ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[COPY18]], 1, [[COPY9]], 0, $noreg - ; CHECK-NEXT: [[COPY19:%[0-9]+]]:gr64_nosp = COPY [[MOV64rm]] - ; CHECK-NEXT: [[MOV32rm6:%[0-9]+]]:gr32 = MOV32rm %stack.2, 1, $noreg, 0, $noreg :: (load (s32) from %stack.2) - ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm6]].sub_16bit, [[COPY10]].sub_16bit, [[LEA64r2]], 1, [[COPY19]], 0, $noreg - ; CHECK-NEXT: [[MOV64rm:%[0-9]+]]:gr64 = COPY [[COPY19]] - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr64 = COPY [[COPY18]] - ; CHECK-NEXT: [[MOV32rm2:%[0-9]+]]:gr64_nosp = COPY [[COPY17]] - ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY16]] - ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY15]] - ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] - ; CHECK-NEXT: [[COPY6:%[0-9]+]]:gr32 = COPY [[COPY13]] - ; CHECK-NEXT: [[COPY4:%[0-9]+]]:gr32 = COPY [[COPY12]] - ; CHECK-NEXT: [[MOV64rm1:%[0-9]+]]:gr64 = COPY [[COPY11]] - ; CHECK-NEXT: [[MOV64rm8:%[0-9]+]]:gr64 = MOV64rm %stack.11, 1, $noreg, 0, $noreg :: (load (s64) from %stack.11) - ; CHECK-NEXT: [[LEA64_32r1:%[0-9]+]]:gr32 = COPY [[COPY10]] - ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY6]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] - ; CHECK-NEXT: PTILESTOREDV [[COPY6]].sub_16bit, [[MOV32rm2]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTILEZEROV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit + ; CHECK-NEXT: [[PTILELOADDV:%[0-9]+]]:tile = PTILELOADDV [[COPY9]].sub_16bit, [[COPY4]].sub_16bit, [[MOV64rm2]], 1, [[MOVSX64rr32_]], 0, $noreg + ; CHECK-NEXT: [[COPY10:%[0-9]+]]:gr64_nosp = MOVSX64rr32 [[COPY10]].sub_32bit + ; CHECK-NEXT: [[COPY11:%[0-9]+]]:gr64 = COPY [[MOVSX64rm32_1]] + ; CHECK-NEXT: [[COPY12:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_3]] + ; CHECK-NEXT: [[COPY13:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_2]] + ; CHECK-NEXT: [[COPY14:%[0-9]+]]:gr64 = COPY [[MOVSX64rr32_]] + ; CHECK-NEXT: [[COPY15:%[0-9]+]]:gr64 = COPY [[COPY7]] + ; CHECK-NEXT: [[MOV64rm7:%[0-9]+]]:gr64 = MOV64rm %stack.5, 1, $noreg, 0, $noreg :: (load (s64) from %stack.5) + ; CHECK-NEXT: [[LEA64r2:%[0-9]+]]:gr64 = LEA64r [[MOV64rm7]], 1, [[COPY10]], 0, $noreg + ; CHECK-NEXT: [[PTILELOADDV1:%[0-9]+]]:tile = PTILELOADDV [[MOV32rm5]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[LEA64r2]], 1, [[MOV64rm]], 0, $noreg + ; CHECK-NEXT: [[COPY7:%[0-9]+]]:gr64_with_sub_8bit = COPY [[COPY15]] + ; CHECK-NEXT: [[MOVSX64rr32_:%[0-9]+]]:gr64_nosp = COPY [[COPY14]] + ; CHECK-NEXT: [[MOVSX64rr32_2:%[0-9]+]]:gr64_nosp = COPY [[COPY13]] + ; CHECK-NEXT: [[MOVSX64rr32_3:%[0-9]+]]:gr64_nosp = COPY [[COPY12]] + ; CHECK-NEXT: [[MOVSX64rm32_1:%[0-9]+]]:gr64 = COPY [[COPY11]] + ; CHECK-NEXT: [[MOV32rm8:%[0-9]+]]:gr32 = MOV32rm %fixed-stack.3, 1, $noreg, 0, $noreg :: (load (s32) from %fixed-stack.3, align 16) + ; CHECK-NEXT: [[COPY9:%[0-9]+]]:gr32 = COPY [[MOV32rm8]] + ; CHECK-NEXT: [[PTILEZEROV:%[0-9]+]]:tile = PTDPBSSDV [[COPY9]].sub_16bit, [[LEA64_32r1]].sub_16bit, [[COPY4]].sub_16bit, [[PTILEZEROV]], [[PTILELOADDV]], [[PTILELOADDV1]] + ; CHECK-NEXT: PTILESTOREDV [[COPY9]].sub_16bit, [[COPY7]].sub_16bit, [[MOV64rm3]], 1, [[MOVSX64rr32_2]], 0, $noreg, [[PTILEZEROV]] ; CHECK-NEXT: [[MOV64rm4:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm4]], [[MOVSX64rr32_3]], implicit-def dead $eflags - ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm8]], implicit-def dead $eflags - ; CHECK-NEXT: [[COPY9:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY9]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags - ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOV64rm1]], implicit-def $eflags + ; CHECK-NEXT: [[MOV64rm3:%[0-9]+]]:gr64 = ADD64rr [[MOV64rm3]], [[MOV64rm1]], implicit-def dead $eflags + ; CHECK-NEXT: [[COPY10:%[0-9]+]].sub_32bit:gr64_nosp = ADD32rr [[COPY10]].sub_32bit, [[LEA64_32r1]], implicit-def dead $eflags + ; CHECK-NEXT: CMP64rr [[MOV64rm4]], [[MOVSX64rm32_1]], implicit-def $eflags ; CHECK-NEXT: JCC_1 %bb.6, 12, implicit $eflags ; CHECK-NEXT: JMP_1 %bb.5 entry: diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll index bf6b096..b428ce4 100644 --- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll +++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll @@ -136,8 +136,6 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: movl (%r8), %edx ; CHECK-NEXT: leal 8(,%rbx,8), %eax ; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill -; CHECK-NEXT: leaq 8(%rsi), %rax -; CHECK-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill ; CHECK-NEXT: leaq 32(%rsi), %r11 ; CHECK-NEXT: leaq 8(,%rbx,8), %rbx ; CHECK-NEXT: xorl %r14d, %r14d @@ -189,7 +187,8 @@ define void @_Z2x6v() local_unnamed_addr { ; CHECK-NEXT: jae .LBB1_7 ; CHECK-NEXT: # %bb.6: # %vector.memcheck ; CHECK-NEXT: # in Loop: Header=BB1_2 Depth=1 -; CHECK-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload +; CHECK-NEXT: leaq 8(%rsi), %r9 +; CHECK-NEXT: addq %r9, %rax ; CHECK-NEXT: leaq (%rax,%r10,8), %rax ; CHECK-NEXT: cmpq %r15, %rax ; CHECK-NEXT: ja .LBB1_14 diff --git a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir index 10ee445..d355374 100644 --- a/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir +++ b/llvm/test/CodeGen/X86/delete-dead-instrs-with-live-uses.mir @@ -7,8 +7,8 @@ # CHECK: jne # CHECK: andl $-16, %edx # CHECK: xorl %ebx, %ebx -# CHECK: movl -16(%ebp), %esi -# CHECK: xorl %eax, %eax +# CHECK: xorl %esi, %esi +# CHECK: movl %eax, %ecx name: test tracksRegLiveness: true diff --git a/llvm/test/CodeGen/X86/inalloca-invoke.ll b/llvm/test/CodeGen/X86/inalloca-invoke.ll index c2728f7..68cb24d 100644 --- a/llvm/test/CodeGen/X86/inalloca-invoke.ll +++ b/llvm/test/CodeGen/X86/inalloca-invoke.ll @@ -23,7 +23,6 @@ blah: ; CHECK: pushl %eax ; CHECK: subl $20, %esp ; CHECK: movl %esp, %[[beg:[^ ]*]] -; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] call void @begin(ptr sret(%Iter) %temp.lvalue) ; CHECK: calll _begin @@ -32,6 +31,7 @@ blah: to label %invoke.cont unwind label %lpad ; Uses end as sret param. +; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]] ; CHECK: pushl %[[end]] ; CHECK: calll _plus diff --git a/llvm/test/CodeGen/X86/licm-regpressure.ll b/llvm/test/CodeGen/X86/licm-regpressure.ll index 72a4832..26ed2a3 100644 --- a/llvm/test/CodeGen/X86/licm-regpressure.ll +++ b/llvm/test/CodeGen/X86/licm-regpressure.ll @@ -1,14 +1,64 @@ -; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -; This tests currently fails as MachineLICM does not compute register pressure -; correctly. More details: llvm.org/PR23143 -; XFAIL: * +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 6 +; RUN: llc < %s -mtriple=x86_64-linux -stop-after=early-machinelicm -o - | FileCheck %s -; MachineLICM should take register pressure into account. -; CHECK-NOT: Spill +; FIXME: MachineLICM does not compute register pressure correctly and we end up +; emitting too many ADD64ri32s. More details: llvm.org/PR23143 %struct.A = type { i32, i32, i32, i32, i32, i32, i32 } define void @test(i1 %b, ptr %a) nounwind { + ; CHECK-LABEL: name: test + ; CHECK: bb.0.entry: + ; CHECK-NEXT: successors: %bb.1(0x80000000) + ; CHECK-NEXT: liveins: $edi, $rsi + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:gr64 = COPY $rsi + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:gr32 = COPY $edi + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:gr8 = COPY [[COPY1]].sub_8bit + ; CHECK-NEXT: [[ADD64ri32_:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 4, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_1:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 8, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_2:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 12, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_3:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 16, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_4:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 20, implicit-def dead $eflags + ; CHECK-NEXT: [[ADD64ri32_5:%[0-9]+]]:gr64 = nuw ADD64ri32 [[COPY]], 24, implicit-def dead $eflags + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.1.loop-body: + ; CHECK-NEXT: successors: %bb.1(0x7c000000), %bb.2(0x04000000) + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[COPY]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_1]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_2]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_3]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_4]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: ADJCALLSTACKDOWN64 0, 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: $rdi = COPY [[ADD64ri32_5]] + ; CHECK-NEXT: CALL64pcrel32 target-flags(x86-plt) @assign, csr_64, implicit $rsp, implicit $ssp, implicit $rdi, implicit-def $rsp, implicit-def $ssp + ; CHECK-NEXT: ADJCALLSTACKUP64 0, 0, implicit-def dead $rsp, implicit-def dead $eflags, implicit-def dead $ssp, implicit $rsp, implicit $ssp + ; CHECK-NEXT: TEST8ri [[COPY2]], 1, implicit-def $eflags + ; CHECK-NEXT: JCC_1 %bb.1, 5, implicit $eflags + ; CHECK-NEXT: JMP_1 %bb.2 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: bb.2.loop-exit: + ; CHECK-NEXT: RET 0 entry: br label %loop-header |