Diffstat (limited to 'llvm/test')
25 files changed, 3490 insertions, 972 deletions
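The common thread in the GISEL hunks below is selection of the AArch64 EXTR instruction for wide-shift lowering: each two-instruction shift-plus-or sequence that stitched adjacent 64-bit limbs together is replaced by a single extract. As a minimal illustration (register numbers are taken from the first hunk and are not normative; actual destinations differ with scheduling), EXTR Xd, Xn, Xm, #lsb yields bits [lsb+63:lsb] of the 128-bit concatenation Xn:Xm, so

    lsr  x10, x8, #32           // x10 = x8 >> 32
    orr  x9, x10, x9, lsl #32   // x9  = x10 | (x9 << 32)

collapses to

    extr x9, x9, x8, #32        // x9 = (x9:x8) >> 32

In a few functions (e.g. rotr_i128_c, fshr_i128_c, fshr_v2i128_c) the GlobalISel output now matches the SelectionDAG output, so the separate CHECK-SD/CHECK-GI blocks merge into common CHECK lines.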
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index 41f7ab8..480fcbd 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_32: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #32 -; GISEL-NEXT: lsr x13, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x10, x9, lsl #32 -; GISEL-NEXT: lsr x10, x11, #32 -; GISEL-NEXT: orr x11, x13, x11, lsl #32 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #32 -; GISEL-NEXT: orr x10, x10, x12, lsl #32 -; GISEL-NEXT: lsr x12, x14, #32 -; GISEL-NEXT: lsr x9, x15, #32 -; GISEL-NEXT: orr x8, x8, x14, lsl #32 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #32 -; GISEL-NEXT: lsr x12, x13, #32 -; GISEL-NEXT: orr x9, x9, x13, lsl #32 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #32 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #32 +; GISEL-NEXT: extr x10, x15, x14, #32 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #32 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #32 -; GISEL-NEXT: lsl x8, x16, #32 -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: lsl x13, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: orr x11, x11, x14, lsr #32 -; GISEL-NEXT: orr x9, x13, x12, lsr #32 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: extr x8, x15, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #32 +; GISEL-NEXT: stp x8, 
x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x15, x9, #32 -; GISEL-NEXT: lsl x16, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #32 -; GISEL-NEXT: lsl x15, x13, #32 -; GISEL-NEXT: orr x9, x16, x9, lsr #32 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #32 -; GISEL-NEXT: orr x10, x15, x10, lsr #32 -; GISEL-NEXT: lsl x15, x12, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 -; GISEL-NEXT: lsl x11, x17, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #32 -; GISEL-NEXT: lsl x13, x16, #32 -; GISEL-NEXT: orr x10, x11, x12, lsr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #32 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #32 +; GISEL-NEXT: extr x9, x15, x14, #32 +; GISEL-NEXT: lsl x8, x8, #32 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #32 +; GISEL-NEXT: extr x11, x13, x12, #32 +; GISEL-NEXT: orr x8, x8, x13, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #32 -; GISEL-NEXT: lsr x16, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x14, x9, lsl #32 -; GISEL-NEXT: lsr x14, x10, #32 -; GISEL-NEXT: orr x10, x16, x10, lsl #32 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #32 -; GISEL-NEXT: orr x11, x14, x11, lsl #32 -; GISEL-NEXT: lsr x14, x12, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #32 -; GISEL-NEXT: orr x8, x8, x12, lsl #32 -; GISEL-NEXT: orr x10, x14, x13, lsl #32 -; GISEL-NEXT: orr x9, x9, x15, lsl #32 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #32 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #32 -; 
GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: orr x10, x12, x10, lsr #32 -; GISEL-NEXT: lsl x12, x11, #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x12, x9, lsr #32 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #32 -; GISEL-NEXT: orr x11, x13, x11, lsr #32 -; GISEL-NEXT: lsl x12, x16, #32 -; GISEL-NEXT: orr x8, x10, x14, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #32 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: lsr x8, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x14, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #32 -; GISEL-NEXT: lsl x14, x13, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: orr x10, x14, x10, lsr #32 -; GISEL-NEXT: lsl x14, x16, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: extr x9, x14, x13, #32 ; GISEL-NEXT: lsl x11, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #32 -; GISEL-NEXT: orr x10, x11, x16, asr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #32 +; GISEL-NEXT: orr x8, x11, x12, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_1: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #63 -; GISEL-NEXT: lsr x13, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x10, x9, lsl #1 -; GISEL-NEXT: lsr x10, x11, #63 -; GISEL-NEXT: orr x11, x13, x11, lsl #1 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #63 -; GISEL-NEXT: orr x10, x10, x12, lsl #1 -; GISEL-NEXT: lsr x12, x14, #63 -; GISEL-NEXT: lsr x9, x15, #63 -; 
GISEL-NEXT: orr x8, x8, x14, lsl #1 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #1 -; GISEL-NEXT: lsr x12, x13, #63 -; GISEL-NEXT: orr x9, x9, x13, lsl #1 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #1 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #63 +; GISEL-NEXT: extr x10, x15, x14, #63 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #63 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #1 -; GISEL-NEXT: lsl x8, x16, #63 -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: lsl x13, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: orr x11, x11, x14, lsr #1 -; GISEL-NEXT: orr x9, x13, x12, lsr #1 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: extr x8, x15, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #1 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x15, x9, #63 -; GISEL-NEXT: lsl x16, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #1 -; GISEL-NEXT: lsl x15, x13, #63 -; GISEL-NEXT: orr x9, x16, x9, lsr #1 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #63 -; GISEL-NEXT: orr x10, x15, x10, lsr #1 -; GISEL-NEXT: lsl x15, x12, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 -; GISEL-NEXT: lsl x11, x17, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #1 -; GISEL-NEXT: lsl x13, x16, #63 -; 
GISEL-NEXT: orr x10, x11, x12, lsr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #1 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #1 +; GISEL-NEXT: extr x9, x15, x14, #1 +; GISEL-NEXT: lsl x8, x8, #63 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #1 +; GISEL-NEXT: extr x11, x13, x12, #1 +; GISEL-NEXT: orr x8, x8, x13, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_15: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #49 -; GISEL-NEXT: lsr x13, x9, #49 -; GISEL-NEXT: lsl x8, x8, #15 -; GISEL-NEXT: orr x9, x10, x9, lsl #15 -; GISEL-NEXT: lsr x10, x11, #49 -; GISEL-NEXT: orr x11, x13, x11, lsl #15 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #49 -; GISEL-NEXT: orr x10, x10, x12, lsl #15 -; GISEL-NEXT: lsr x12, x14, #49 -; GISEL-NEXT: lsr x9, x15, #49 -; GISEL-NEXT: orr x8, x8, x14, lsl #15 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #15 -; GISEL-NEXT: lsr x12, x13, #49 -; GISEL-NEXT: orr x9, x9, x13, lsl #15 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #15 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #15 +; GISEL-NEXT: extr x8, x9, x8, #49 +; GISEL-NEXT: extr x9, x10, x9, #49 +; GISEL-NEXT: extr x10, x11, x10, #49 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #49 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #49 +; GISEL-NEXT: extr x10, x15, x14, #49 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #49 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x13, x9, #49 -; GISEL-NEXT: lsl x15, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: orr x8, x13, x8, lsr #15 -; GISEL-NEXT: lsl x13, x14, #49 -; GISEL-NEXT: orr x9, x15, x9, lsr #15 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #15 -; GISEL-NEXT: lsl x8, x16, #49 -; GISEL-NEXT: lsl x11, x12, #49 -; GISEL-NEXT: lsl x13, x15, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #15 -; GISEL-NEXT: lsr x10, x16, #15 -; GISEL-NEXT: orr x11, x11, x14, lsr #15 -; GISEL-NEXT: orr x9, x13, x12, lsr #15 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, 
x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #15 +; GISEL-NEXT: extr x9, x13, x12, #15 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #15 +; GISEL-NEXT: extr x8, x15, x14, #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #15 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x15, x9, #49 -; GISEL-NEXT: lsl x16, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #15 -; GISEL-NEXT: lsl x15, x13, #49 -; GISEL-NEXT: orr x9, x16, x9, lsr #15 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #49 -; GISEL-NEXT: orr x10, x15, x10, lsr #15 -; GISEL-NEXT: lsl x15, x12, #49 -; GISEL-NEXT: orr x8, x11, x13, lsr #15 -; GISEL-NEXT: lsl x11, x17, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #15 -; GISEL-NEXT: lsl x13, x16, #49 -; GISEL-NEXT: orr x10, x11, x12, lsr #15 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #15 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #15 +; GISEL-NEXT: extr x9, x15, x14, #15 +; GISEL-NEXT: lsl x8, x8, #49 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #15 +; GISEL-NEXT: extr x11, x13, x12, #15 +; GISEL-NEXT: orr x8, x8, x13, asr #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_63: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #1 -; GISEL-NEXT: lsr x13, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x10, x9, lsl #63 -; GISEL-NEXT: lsr x10, x11, #1 -; GISEL-NEXT: orr x11, x13, x11, lsl #63 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #1 -; GISEL-NEXT: orr x10, x10, x12, lsl #63 -; GISEL-NEXT: lsr x12, x14, #1 -; GISEL-NEXT: lsr x9, x15, #1 -; GISEL-NEXT: orr x8, x8, x14, lsl #63 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #63 -; GISEL-NEXT: lsr x12, x13, #1 -; GISEL-NEXT: orr x9, x9, x13, lsl #63 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #63 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; 
GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #1 +; GISEL-NEXT: extr x10, x15, x14, #1 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #1 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: lsl x15, x10, #1 -; GISEL-NEXT: orr x11, x12, x11, lsr #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x15, x9, lsr #63 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #63 -; GISEL-NEXT: lsl x8, x16, #1 -; GISEL-NEXT: lsl x11, x12, #1 -; GISEL-NEXT: lsl x13, x15, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: orr x11, x11, x14, lsr #63 -; GISEL-NEXT: orr x9, x13, x12, lsr #63 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #63 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x10, [x1] -; GISEL-NEXT: ldp x11, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x15, x9, #1 -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x16, x11, #1 -; GISEL-NEXT: orr x8, x15, x8, lsr #63 -; GISEL-NEXT: lsl x15, x13, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x9, x16, x9, lsr #63 -; GISEL-NEXT: orr x11, x15, x11, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x8, x17, #1 -; GISEL-NEXT: lsl x16, x14, #1 -; GISEL-NEXT: lsl x10, x12, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: asr x9, x17, #63 -; GISEL-NEXT: orr x8, x8, x12, lsr #63 -; GISEL-NEXT: orr x13, x16, x13, lsr #63 -; GISEL-NEXT: orr x10, x10, x14, lsr #63 -; GISEL-NEXT: orr x9, x9, x9, lsl #1 -; GISEL-NEXT: stp x13, x10, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; 
GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: extr x11, x14, x13, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: asr x10, x15, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x11, [x0, #32] +; GISEL-NEXT: orr x9, x10, x10, lsl #1 ; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: @@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #63 -; GISEL-NEXT: lsr x16, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x14, x9, lsl #1 -; GISEL-NEXT: lsr x14, x10, #63 -; GISEL-NEXT: orr x10, x16, x10, lsl #1 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #63 -; GISEL-NEXT: orr x11, x14, x11, lsl #1 -; GISEL-NEXT: lsr x14, x12, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #63 -; GISEL-NEXT: orr x8, x8, x12, lsl #1 -; GISEL-NEXT: orr x10, x14, x13, lsl #1 -; GISEL-NEXT: orr x9, x9, x15, lsl #1 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #63 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: orr x10, x12, x10, lsr #1 -; GISEL-NEXT: lsl x12, x11, #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x12, x9, lsr #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #63 -; GISEL-NEXT: orr x11, x13, x11, lsr #1 -; GISEL-NEXT: lsl x12, x16, #63 -; GISEL-NEXT: orr x8, x10, x14, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #1 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: lsr x8, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x14, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; 
GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #1 -; GISEL-NEXT: lsl x14, x13, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: orr x10, x14, x10, lsr #1 -; GISEL-NEXT: lsl x14, x16, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: extr x9, x14, x13, #1 ; GISEL-NEXT: lsl x11, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #1 -; GISEL-NEXT: orr x10, x11, x16, asr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #1 +; GISEL-NEXT: orr x8, x11, x12, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #28 -; GISEL-NEXT: lsr x16, x9, #28 -; GISEL-NEXT: lsl x8, x8, #36 -; GISEL-NEXT: orr x9, x14, x9, lsl #36 -; GISEL-NEXT: lsr x14, x10, #28 -; GISEL-NEXT: orr x10, x16, x10, lsl #36 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #28 -; GISEL-NEXT: orr x11, x14, x11, lsl #36 -; GISEL-NEXT: lsr x14, x12, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #28 -; GISEL-NEXT: orr x8, x8, x12, lsl #36 -; GISEL-NEXT: orr x10, x14, x13, lsl #36 -; GISEL-NEXT: orr x9, x9, x15, lsl #36 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #36 +; GISEL-NEXT: extr x8, x9, x8, #28 +; GISEL-NEXT: extr x9, x10, x9, #28 +; GISEL-NEXT: extr x10, x11, x10, #28 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #28 +; GISEL-NEXT: extr x9, x13, x12, #28 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #28 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x13, x9, #28 -; GISEL-NEXT: orr x10, x12, x10, lsr #36 -; GISEL-NEXT: lsl x12, x11, #28 -; GISEL-NEXT: orr x8, x13, x8, lsr #36 -; GISEL-NEXT: lsl x13, x14, #28 -; GISEL-NEXT: orr x9, x12, x9, lsr #36 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #28 -; GISEL-NEXT: orr x11, x13, x11, lsr #36 -; GISEL-NEXT: lsl x12, x16, #28 -; GISEL-NEXT: orr x8, x10, x14, lsr #36 -; GISEL-NEXT: lsr x10, x16, #36 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #36 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; 
GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #36 +; GISEL-NEXT: extr x9, x13, x12, #36 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #36 +; GISEL-NEXT: lsr x8, x14, #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x14, x9, #28 -; GISEL-NEXT: lsl x15, x10, #28 -; GISEL-NEXT: orr x11, x12, x11, lsr #36 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #36 -; GISEL-NEXT: lsl x14, x13, #28 -; GISEL-NEXT: orr x9, x15, x9, lsr #36 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #28 -; GISEL-NEXT: orr x10, x14, x10, lsr #36 -; GISEL-NEXT: lsl x14, x16, #28 -; GISEL-NEXT: orr x8, x11, x13, lsr #36 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #36 +; GISEL-NEXT: extr x9, x14, x13, #36 ; GISEL-NEXT: lsl x11, x15, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #36 -; GISEL-NEXT: orr x10, x11, x16, asr #36 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #36 +; GISEL-NEXT: orr x8, x11, x12, asr #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #1 -; GISEL-NEXT: lsr x16, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x14, x9, lsl #63 -; GISEL-NEXT: lsr x14, x10, #1 -; GISEL-NEXT: orr x10, x16, x10, lsl #63 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #1 -; GISEL-NEXT: orr x11, x14, x11, lsl #63 -; GISEL-NEXT: lsr x14, x12, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #1 -; GISEL-NEXT: orr x8, x8, x12, lsl #63 -; GISEL-NEXT: orr x10, x14, x13, lsl #63 -; GISEL-NEXT: orr x9, x9, x15, lsl #63 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #1 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6266,27 +6114,21 @@ 
define void @test_lshr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: lsl x12, x16, #1 -; GISEL-NEXT: orr x8, x10, x14, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #63 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: lsr x8, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: lsl x12, x15, #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x16, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: asr x8, x16, #63 -; GISEL-NEXT: orr x12, x12, x14, lsr #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x10, x15, lsr #63 -; GISEL-NEXT: orr x10, x8, x8, lsl #1 -; GISEL-NEXT: stp x12, x9, [x0, #32] -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: asr x9, x14, #63 +; GISEL-NEXT: extr x11, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: orr x8, x9, x9, lsl #1 +; GISEL-NEXT: stp x11, x10, [x0, #32] +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll index 12e8bf2..03f3cf1 100644 --- a/llvm/test/CodeGen/AArch64/adc.ll +++ b/llvm/test/CodeGen/AArch64/adc.ll @@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: test_shifted: ; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: lsr x8, x2, #19 +; CHECK-GI-NEXT: extr x8, x3, x2, #19 ; CHECK-GI-NEXT: adds x0, 
x0, x2, lsl #45 -; CHECK-GI-NEXT: orr x8, x8, x3, lsl #45 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %rhs = shl i128 %b, 45 @@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) { ; CHECK-GI-NEXT: sxth x8, w2 ; CHECK-GI-NEXT: adds x0, x0, w2, sxth #3 ; CHECK-GI-NEXT: asr x9, x8, #63 -; CHECK-GI-NEXT: lsr x8, x8, #61 -; CHECK-GI-NEXT: orr x8, x8, x9, lsl #3 +; CHECK-GI-NEXT: extr x8, x9, x8, #61 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %ext = sext i16 %b to i128 diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 765f6b7..7f07ef4 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c) @@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshr_i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #63 -; CHECK-GI-NEXT: mov w9, #127 // =0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: bic x9, x9, x4 -; CHECK-GI-NEXT: lsl x11, x0, #1 -; CHECK-GI-NEXT: and x12, x4, #0x7f -; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1 -; CHECK-GI-NEXT: sub x14, x10, x9 -; 
CHECK-GI-NEXT: sub x17, x9, #64 -; CHECK-GI-NEXT: lsl x15, x11, x9 -; CHECK-GI-NEXT: lsr x14, x11, x14 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x16, x8, x9 -; CHECK-GI-NEXT: sub x9, x10, x12 -; CHECK-GI-NEXT: lsl x10, x11, x17 -; CHECK-GI-NEXT: mvn x13, x4 -; CHECK-GI-NEXT: csel x11, x15, xzr, lo -; CHECK-GI-NEXT: sub x15, x12, #64 -; CHECK-GI-NEXT: orr x14, x14, x16 -; CHECK-GI-NEXT: lsr x16, x2, x12 -; CHECK-GI-NEXT: lsl x9, x3, x9 -; CHECK-GI-NEXT: csel x10, x14, x10, lo -; CHECK-GI-NEXT: tst x13, #0x7f -; CHECK-GI-NEXT: lsr x13, x3, x15 -; CHECK-GI-NEXT: csel x8, x8, x10, eq -; CHECK-GI-NEXT: orr x9, x16, x9 -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: lsr x10, x3, x12 -; CHECK-GI-NEXT: csel x9, x9, x13, lo +; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: lsl x9, x0, #1 +; CHECK-GI-NEXT: extr x10, x1, x0, #63 +; CHECK-GI-NEXT: bic x8, x8, x4 +; CHECK-GI-NEXT: mov w11, #64 // =0x40 +; CHECK-GI-NEXT: and x14, x4, #0x7f +; CHECK-GI-NEXT: sub x12, x11, x8 +; CHECK-GI-NEXT: lsl x13, x10, x8 +; CHECK-GI-NEXT: lsl x16, x9, x8 +; CHECK-GI-NEXT: lsr x12, x9, x12 +; CHECK-GI-NEXT: sub x17, x8, #64 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x8, x9, x17 +; CHECK-GI-NEXT: sub x11, x11, x14 +; CHECK-GI-NEXT: mvn x15, x4 +; CHECK-GI-NEXT: orr x12, x12, x13 +; CHECK-GI-NEXT: csel x9, x16, xzr, lo +; CHECK-GI-NEXT: sub x13, x14, #64 +; CHECK-GI-NEXT: lsr x16, x2, x14 +; CHECK-GI-NEXT: lsl x11, x3, x11 +; CHECK-GI-NEXT: csel x8, x12, x8, lo +; CHECK-GI-NEXT: tst x15, #0x7f +; CHECK-GI-NEXT: lsr x12, x3, x13 +; CHECK-GI-NEXT: csel x8, x10, x8, eq +; CHECK-GI-NEXT: orr x10, x16, x11 +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: lsr x11, x3, x14 +; CHECK-GI-NEXT: csel x10, x10, x12, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: csel x9, x2, x9, eq -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: csel x10, x10, xzr, lo -; CHECK-GI-NEXT: orr x0, x11, x9 -; CHECK-GI-NEXT: orr x1, x8, x10 +; CHECK-GI-NEXT: csel x10, x2, x10, eq +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: csel x11, x11, xzr, lo +; CHECK-GI-NEXT: orr x0, x9, x10 +; CHECK-GI-NEXT: orr x1, x8, x11 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c) @@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) { ; ; CHECK-GI-LABEL: rotl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x1, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x1, #61 +; CHECK-GI-NEXT: mov x1, x8 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3) @@ -731,20 +728,12 @@ entry: } define i128 @rotr_i128_c(i128 %a) { -; CHECK-SD-LABEL: rotr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x1, x0, #3 -; CHECK-SD-NEXT: extr x1, x0, x1, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: rotr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x0, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: rotr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x1, x0, #3 +; CHECK-NEXT: extr x1, x0, x1, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3) ret i128 %d @@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: fshl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; 
CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x3, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x3, #61 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3) @@ -879,21 +866,12 @@ entry: } define i128 @fshr_i128_c(i128 %a, i128 %b) { -; CHECK-SD-LABEL: fshr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x3, x2, #3 -; CHECK-SD-NEXT: extr x1, x0, x3, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x3, #61 -; CHECK-GI-NEXT: lsr x9, x3, #3 -; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x3, x2, #3 +; CHECK-NEXT: extr x1, x0, x3, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3) ret i128 %d @@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w19, -16 ; CHECK-GI-NEXT: ldr x11, [sp, #16] -; CHECK-GI-NEXT: mov w10, #64 // =0x40 +; CHECK-GI-NEXT: mov w9, #64 // =0x40 ; CHECK-GI-NEXT: ldr x12, [sp, #32] ; CHECK-GI-NEXT: mov w13, #127 // =0x7f -; CHECK-GI-NEXT: and x9, x11, #0x7f +; CHECK-GI-NEXT: and x8, x11, #0x7f ; CHECK-GI-NEXT: and x14, x12, #0x7f -; CHECK-GI-NEXT: mvn x15, x11 -; CHECK-GI-NEXT: sub x8, x10, x9 -; CHECK-GI-NEXT: sub x16, x9, #64 -; CHECK-GI-NEXT: lsl x19, x1, x9 -; CHECK-GI-NEXT: lsr x18, x0, x8 -; CHECK-GI-NEXT: lsl x17, x0, x9 -; CHECK-GI-NEXT: lsl x16, x0, x16 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: bic x0, x13, x11 -; CHECK-GI-NEXT: mvn x8, x12 -; CHECK-GI-NEXT: orr x18, x18, x19 -; CHECK-GI-NEXT: csel x9, x17, xzr, lo +; CHECK-GI-NEXT: mvn x18, x11 +; CHECK-GI-NEXT: sub x10, x9, x8 +; CHECK-GI-NEXT: sub x15, x8, #64 +; CHECK-GI-NEXT: lsl x17, x1, x8 +; CHECK-GI-NEXT: lsr x16, x0, x10 +; CHECK-GI-NEXT: lsl x15, x0, x15 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x19, x0, x8 +; CHECK-GI-NEXT: lsl x0, x3, x14 +; CHECK-GI-NEXT: mvn x10, x12 +; CHECK-GI-NEXT: orr x16, x16, x17 ; CHECK-GI-NEXT: sub x17, x14, #64 -; CHECK-GI-NEXT: csel x16, x18, x16, lo +; CHECK-GI-NEXT: csel x15, x16, x15, lo +; CHECK-GI-NEXT: sub x16, x9, x14 +; CHECK-GI-NEXT: csel x8, x19, xzr, lo +; CHECK-GI-NEXT: lsr x16, x2, x16 ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x10, x14 -; CHECK-GI-NEXT: lsr x11, x2, x11 -; CHECK-GI-NEXT: lsl x18, x3, x14 -; CHECK-GI-NEXT: csel x16, x1, x16, eq -; CHECK-GI-NEXT: lsl x1, x2, x14 +; CHECK-GI-NEXT: lsl x19, x2, x14 ; CHECK-GI-NEXT: lsl x17, x2, x17 +; CHECK-GI-NEXT: csel x15, x1, x15, eq ; CHECK-GI-NEXT: cmp x14, #64 -; CHECK-GI-NEXT: lsl x14, x5, #63 -; CHECK-GI-NEXT: orr x11, x11, x18 -; CHECK-GI-NEXT: bic x13, x13, x12 -; CHECK-GI-NEXT: csel x18, x1, xzr, lo -; CHECK-GI-NEXT: csel x11, x11, x17, lo +; CHECK-GI-NEXT: orr x16, x16, x0 +; CHECK-GI-NEXT: bic x11, x13, x11 +; CHECK-GI-NEXT: csel x14, x19, xzr, lo +; CHECK-GI-NEXT: csel x16, x16, x17, lo ; CHECK-GI-NEXT: tst x12, #0x7f -; CHECK-GI-NEXT: lsr x12, x5, #1 -; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1 -; CHECK-GI-NEXT: lsl x17, x7, #63 -; CHECK-GI-NEXT: sub x1, x10, x0 -; CHECK-GI-NEXT: csel x11, x3, x11, eq -; CHECK-GI-NEXT: sub x2, x0, #64 -; CHECK-GI-NEXT: lsr x3, x14, x0 -; 
CHECK-GI-NEXT: lsl x1, x12, x1 -; CHECK-GI-NEXT: lsr x4, x7, #1 -; CHECK-GI-NEXT: orr x17, x17, x6, lsr #1 -; CHECK-GI-NEXT: lsr x2, x12, x2 -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: orr x1, x3, x1 -; CHECK-GI-NEXT: sub x10, x10, x13 -; CHECK-GI-NEXT: lsr x12, x12, x0 -; CHECK-GI-NEXT: csel x1, x1, x2, lo -; CHECK-GI-NEXT: tst x15, #0x7f -; CHECK-GI-NEXT: sub x15, x13, #64 -; CHECK-GI-NEXT: lsr x2, x17, x13 -; CHECK-GI-NEXT: lsl x10, x4, x10 -; CHECK-GI-NEXT: csel x14, x14, x1, eq -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: lsr x15, x4, x15 -; CHECK-GI-NEXT: lsr x0, x4, x13 -; CHECK-GI-NEXT: csel x12, x12, xzr, lo -; CHECK-GI-NEXT: orr x10, x2, x10 -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x10, x15, lo -; CHECK-GI-NEXT: tst x8, #0x7f -; CHECK-GI-NEXT: orr x1, x16, x12 -; CHECK-GI-NEXT: csel x8, x17, x10, eq -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x0, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x14 -; CHECK-GI-NEXT: orr x2, x18, x8 -; CHECK-GI-NEXT: orr x3, x11, x10 +; CHECK-GI-NEXT: lsr x17, x5, #1 +; CHECK-GI-NEXT: extr x0, x5, x4, #1 +; CHECK-GI-NEXT: bic x12, x13, x12 +; CHECK-GI-NEXT: csel x13, x3, x16, eq +; CHECK-GI-NEXT: sub x16, x9, x11 +; CHECK-GI-NEXT: sub x1, x11, #64 +; CHECK-GI-NEXT: lsr x3, x7, #1 +; CHECK-GI-NEXT: lsr x2, x0, x11 +; CHECK-GI-NEXT: lsl x16, x17, x16 +; CHECK-GI-NEXT: extr x4, x7, x6, #1 +; CHECK-GI-NEXT: lsr x1, x17, x1 +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: sub x9, x9, x12 +; CHECK-GI-NEXT: orr x16, x2, x16 +; CHECK-GI-NEXT: lsr x17, x17, x11 +; CHECK-GI-NEXT: lsl x9, x3, x9 +; CHECK-GI-NEXT: csel x16, x16, x1, lo +; CHECK-GI-NEXT: tst x18, #0x7f +; CHECK-GI-NEXT: sub x18, x12, #64 +; CHECK-GI-NEXT: lsr x1, x4, x12 +; CHECK-GI-NEXT: csel x16, x0, x16, eq +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: lsr x11, x3, x18 +; CHECK-GI-NEXT: csel x17, x17, xzr, lo +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: orr x9, x1, x9 +; CHECK-GI-NEXT: lsr x18, x3, x12 +; CHECK-GI-NEXT: orr x0, x8, x16 +; CHECK-GI-NEXT: csel x9, x9, x11, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: orr x1, x15, x17 +; CHECK-GI-NEXT: csel x9, x4, x9, eq +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: csel x10, x18, xzr, lo +; CHECK-GI-NEXT: orr x2, x14, x9 +; CHECK-GI-NEXT: orr x3, x13, x10 ; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret entry: @@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-LABEL: fshr_v2i128: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr x9, [sp] -; CHECK-GI-NEXT: lsl x12, x1, #1 -; CHECK-GI-NEXT: mov w11, #127 // =0x7f -; CHECK-GI-NEXT: mov w14, #64 // =0x40 -; CHECK-GI-NEXT: lsl x15, x0, #1 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: mov w12, #64 // =0x40 +; CHECK-GI-NEXT: lsl x13, x0, #1 +; CHECK-GI-NEXT: extr x14, x1, x0, #63 ; CHECK-GI-NEXT: ldr x8, [sp, #16] -; CHECK-GI-NEXT: bic x13, x11, x9 -; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63 -; CHECK-GI-NEXT: lsl x1, x3, #1 -; CHECK-GI-NEXT: sub x17, x14, x13 -; CHECK-GI-NEXT: sub x18, x13, #64 -; CHECK-GI-NEXT: lsl x3, x15, x13 -; CHECK-GI-NEXT: lsr x17, x15, x17 -; CHECK-GI-NEXT: lsl x0, x12, x13 -; CHECK-GI-NEXT: lsl x15, x15, x18 -; CHECK-GI-NEXT: bic x11, x11, x8 +; CHECK-GI-NEXT: bic x11, x10, x9 +; CHECK-GI-NEXT: mvn x16, x9 +; CHECK-GI-NEXT: and x15, x9, #0x7f +; CHECK-GI-NEXT: sub x17, x12, x11 +; CHECK-GI-NEXT: sub x18, x11, #64 +; CHECK-GI-NEXT: lsl x0, x14, x11 +; CHECK-GI-NEXT: lsr x17, x13, x17 +; CHECK-GI-NEXT: lsl x1, x13, x11 
+; CHECK-GI-NEXT: lsl x13, x13, x18 +; CHECK-GI-NEXT: bic x10, x10, x8 ; CHECK-GI-NEXT: lsl x18, x2, #1 -; CHECK-GI-NEXT: cmp x13, #64 +; CHECK-GI-NEXT: cmp x11, #64 ; CHECK-GI-NEXT: orr x17, x17, x0 -; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63 -; CHECK-GI-NEXT: mvn x16, x9 -; CHECK-GI-NEXT: csel x15, x17, x15, lo -; CHECK-GI-NEXT: sub x17, x14, x11 -; CHECK-GI-NEXT: csel x0, x3, xzr, lo +; CHECK-GI-NEXT: extr x11, x3, x2, #63 +; CHECK-GI-NEXT: csel x0, x1, xzr, lo +; CHECK-GI-NEXT: csel x13, x17, x13, lo +; CHECK-GI-NEXT: sub x17, x12, x10 ; CHECK-GI-NEXT: tst x16, #0x7f -; CHECK-GI-NEXT: sub x16, x11, #64 +; CHECK-GI-NEXT: sub x16, x10, #64 ; CHECK-GI-NEXT: lsr x17, x18, x17 -; CHECK-GI-NEXT: lsl x2, x13, x11 -; CHECK-GI-NEXT: lsl x1, x18, x11 -; CHECK-GI-NEXT: csel x12, x12, x15, eq -; CHECK-GI-NEXT: lsl x15, x18, x16 -; CHECK-GI-NEXT: and x10, x9, #0x7f -; CHECK-GI-NEXT: cmp x11, #64 -; CHECK-GI-NEXT: mvn x11, x8 +; CHECK-GI-NEXT: lsl x2, x11, x10 +; CHECK-GI-NEXT: lsl x1, x18, x10 +; CHECK-GI-NEXT: csel x13, x14, x13, eq +; CHECK-GI-NEXT: lsl x14, x18, x16 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: mvn x10, x8 ; CHECK-GI-NEXT: orr x16, x17, x2 ; CHECK-GI-NEXT: csel x17, x1, xzr, lo -; CHECK-GI-NEXT: csel x15, x16, x15, lo -; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x14, x10 -; CHECK-GI-NEXT: sub x16, x10, #64 -; CHECK-GI-NEXT: lsr x18, x4, x10 -; CHECK-GI-NEXT: lsl x11, x5, x11 -; CHECK-GI-NEXT: csel x13, x13, x15, eq -; CHECK-GI-NEXT: lsr x15, x5, x16 +; CHECK-GI-NEXT: csel x14, x16, x14, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: sub x10, x12, x15 +; CHECK-GI-NEXT: sub x16, x15, #64 +; CHECK-GI-NEXT: lsr x18, x4, x15 +; CHECK-GI-NEXT: lsl x10, x5, x10 +; CHECK-GI-NEXT: csel x11, x11, x14, eq +; CHECK-GI-NEXT: lsr x14, x5, x16 ; CHECK-GI-NEXT: and x1, x8, #0x7f -; CHECK-GI-NEXT: orr x11, x18, x11 -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x16, x5, x10 -; CHECK-GI-NEXT: csel x11, x11, x15, lo +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x16, x5, x15 +; CHECK-GI-NEXT: orr x10, x18, x10 +; CHECK-GI-NEXT: csel x10, x10, x14, lo ; CHECK-GI-NEXT: tst x9, #0x7f -; CHECK-GI-NEXT: sub x9, x14, x1 -; CHECK-GI-NEXT: sub x14, x1, #64 -; CHECK-GI-NEXT: lsr x15, x6, x1 +; CHECK-GI-NEXT: sub x9, x12, x1 +; CHECK-GI-NEXT: sub x12, x1, #64 +; CHECK-GI-NEXT: lsr x14, x6, x1 ; CHECK-GI-NEXT: lsl x9, x7, x9 -; CHECK-GI-NEXT: csel x11, x4, x11, eq -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x10, x7, x14 -; CHECK-GI-NEXT: csel x14, x16, xzr, lo -; CHECK-GI-NEXT: orr x9, x15, x9 +; CHECK-GI-NEXT: csel x10, x4, x10, eq +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x12, x7, x12 +; CHECK-GI-NEXT: csel x15, x16, xzr, lo +; CHECK-GI-NEXT: orr x9, x14, x9 ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: lsr x15, x7, x1 -; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: lsr x14, x7, x1 +; CHECK-GI-NEXT: csel x9, x9, x12, lo ; CHECK-GI-NEXT: tst x8, #0x7f ; CHECK-GI-NEXT: csel x8, x6, x9, eq ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: orr x0, x0, x11 -; CHECK-GI-NEXT: csel x9, x15, xzr, lo -; CHECK-GI-NEXT: orr x1, x12, x14 +; CHECK-GI-NEXT: orr x0, x0, x10 +; CHECK-GI-NEXT: csel x9, x14, xzr, lo +; CHECK-GI-NEXT: orr x1, x13, x15 ; CHECK-GI-NEXT: orr x2, x17, x8 -; CHECK-GI-NEXT: orr x3, x13, x9 +; CHECK-GI-NEXT: orr x3, x11, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) @@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotl_v2i128_c: ; 
CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x3, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x1, #61 +; CHECK-GI-NEXT: extr x9, x3, x2, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x2, x2, x3, #61 ; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x3, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotr_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x3, #61 -; CHECK-GI-NEXT: lsl x10, x0, #61 -; CHECK-GI-NEXT: lsl x11, x2, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3 -; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #3 +; CHECK-GI-NEXT: extr x9, x3, x2, #3 +; CHECK-GI-NEXT: extr x1, x0, x1, #3 +; CHECK-GI-NEXT: extr x3, x2, x3, #3 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x2, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) { ; ; CHECK-GI-LABEL: fshl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x7, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x5, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x3, x3, x2, #61 +; CHECK-GI-NEXT: extr x2, x2, x7, #61 ; CHECK-GI-NEXT: mov x0, x8 ; CHECK-GI-NEXT: ret entry: @@ -4480,29 +4445,15 @@ entry: } define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) { -; CHECK-SD-LABEL: fshr_v2i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x5, x4, #3 -; CHECK-SD-NEXT: extr x9, x7, x6, #3 -; CHECK-SD-NEXT: extr x1, x0, x5, #3 -; CHECK-SD-NEXT: extr x3, x2, x7, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: mov x2, x9 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_v2i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x7, #61 -; CHECK-GI-NEXT: lsr x10, x5, #3 -; CHECK-GI-NEXT: lsr x11, x7, #3 -; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3 -; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61 -; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: mov x2, x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_v2i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x5, x4, #3 +; CHECK-NEXT: extr x9, x7, x6, #3 +; CHECK-NEXT: extr x1, x0, x5, #3 +; CHECK-NEXT: extr x3, x2, x7, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, x9 +; CHECK-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>) ret <2 x i128> %d diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll index f9fd2ad..90fb102 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ 
b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1cb92e4..87b1108 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: subs x10, x0, x9 ; CHECK-GI-NEXT: sbc x11, x1, x8 -; CHECK-GI-NEXT: lsl x12, x11, #63 +; CHECK-GI-NEXT: extr x10, x11, x10, #1 ; CHECK-GI-NEXT: lsr x11, x11, #1 -; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1 ; CHECK-GI-NEXT: adds x9, x10, x9 +; CHECK-GI-NEXT: mov w10, #7 // =0x7 ; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x10, x8, #62 +; CHECK-GI-NEXT: extr x9, x8, x9, #2 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2 -; CHECK-GI-NEXT: mov w10, #7 // =0x7 -; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: umulh x10, x9, x10 ; CHECK-GI-NEXT: lsl x11, x9, #3 -; CHECK-GI-NEXT: sub x8, x12, x8 +; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: sub x9, x11, x9 +; CHECK-GI-NEXT: sub x8, x12, x8 
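The AArch64 hunks above all share one pattern: when the shift amount is a known constant, GlobalISel now selects EXTR, which extracts a 64-bit field from the concatenation of two registers, in place of an lsl/lsr plus orr pair. A minimal standalone repro of the same combine, assuming a RUN line in the style of these tests (the function name and exact register choices below are illustrative, not taken from this diff):

; RUN: llc -mtriple=aarch64 -global-isel=1 < %s | FileCheck %s
; CHECK: extr x0, x1, x0, #1
define i64 @lshr_i128_low_half(i128 %x) {
  ; (lo >> 1) | (hi << 63) folds to a single extr of the register pair
  %s = lshr i128 %x, 1
  %t = trunc i128 %s to i64
  ret i64 %t
}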
; CHECK-GI-NEXT: subs x0, x0, x9 ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: sbc x1, x1, x8 @@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x10, x11, x12 ; CHECK-GI-NEXT: add x8, x8, x14 ; CHECK-GI-NEXT: add x8, x8, x10 -; CHECK-GI-NEXT: lsl x10, x8, #60 -; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4 ; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: extr x9, x8, x9, #4 +; CHECK-GI-NEXT: lsr x8, x8, #4 ; CHECK-GI-NEXT: umulh x11, x9, x10 ; CHECK-GI-NEXT: mul x9, x9, x10 ; CHECK-GI-NEXT: madd x8, x8, x10, x11 @@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: sbc x14, x1, x12 ; CHECK-GI-NEXT: add x8, x8, x13 ; CHECK-GI-NEXT: subs x13, x2, x10 -; CHECK-GI-NEXT: lsl x15, x14, #63 -; CHECK-GI-NEXT: sbc x16, x3, x8 +; CHECK-GI-NEXT: extr x9, x14, x9, #1 +; CHECK-GI-NEXT: sbc x15, x3, x8 ; CHECK-GI-NEXT: lsr x14, x14, #1 -; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1 -; CHECK-GI-NEXT: lsl x15, x16, #63 -; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1 +; CHECK-GI-NEXT: extr x13, x15, x13, #1 ; CHECK-GI-NEXT: adds x9, x9, x11 -; CHECK-GI-NEXT: lsr x11, x16, #1 +; CHECK-GI-NEXT: lsr x11, x15, #1 ; CHECK-GI-NEXT: adc x12, x14, x12 ; CHECK-GI-NEXT: adds x10, x13, x10 -; CHECK-GI-NEXT: lsl x13, x12, #62 -; CHECK-GI-NEXT: lsr x12, x12, #2 -; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x11, x8, #62 -; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2 +; CHECK-GI-NEXT: extr x9, x12, x9, #2 ; CHECK-GI-NEXT: mov w13, #7 // =0x7 +; CHECK-GI-NEXT: adc x8, x11, x8 +; CHECK-GI-NEXT: lsr x11, x12, #2 +; CHECK-GI-NEXT: extr x10, x8, x10, #2 +; CHECK-GI-NEXT: umulh x12, x9, x13 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: lsl x14, x12, #3 -; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2 -; CHECK-GI-NEXT: umulh x11, x9, x13 +; CHECK-GI-NEXT: lsl x14, x11, #3 ; CHECK-GI-NEXT: lsl x15, x9, #3 -; CHECK-GI-NEXT: sub x12, x14, x12 -; CHECK-GI-NEXT: lsl x16, x8, #3 ; CHECK-GI-NEXT: umulh x13, x10, x13 +; CHECK-GI-NEXT: lsl x16, x8, #3 +; CHECK-GI-NEXT: sub x11, x14, x11 ; CHECK-GI-NEXT: lsl x14, x10, #3 ; CHECK-GI-NEXT: sub x9, x15, x9 ; CHECK-GI-NEXT: sub x8, x16, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: add x11, x11, x12 ; CHECK-GI-NEXT: sub x10, x14, x10 -; CHECK-GI-NEXT: add x11, x12, x11 ; CHECK-GI-NEXT: sbc x1, x1, x11 ; CHECK-GI-NEXT: subs x2, x2, x10 ; CHECK-GI-NEXT: add x8, x8, x13 @@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29 ; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2 -; CHECK-GI-NEXT: sub x18, x0, x0 +; CHECK-GI-NEXT: and x5, xzr, #0x1 ; CHECK-GI-NEXT: movk x10, #49807, lsl #16 ; CHECK-GI-NEXT: movk x8, #23592, lsl #16 +; CHECK-GI-NEXT: umulh x18, x0, xzr ; CHECK-GI-NEXT: movk x10, #10485, lsl #32 ; CHECK-GI-NEXT: movk x8, #49807, lsl #32 ; CHECK-GI-NEXT: movk x10, #36700, lsl #48 @@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: umulh x15, x1, x10 ; CHECK-GI-NEXT: cset w12, hs ; CHECK-GI-NEXT: cmn x11, x13 -; CHECK-GI-NEXT: and x11, x12, #0x1 -; CHECK-GI-NEXT: umulh x16, x0, x8 -; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: sub x13, x0, x0 ; CHECK-GI-NEXT: and x12, x12, #0x1 -; CHECK-GI-NEXT: add x14, x14, x18 -; CHECK-GI-NEXT: add x11, x11, x12 -; CHECK-GI-NEXT: and x12, xzr, #0x1 +; CHECK-GI-NEXT: umulh x16, x0, x8 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: add x13, x14, x13 +; CHECK-GI-NEXT: and x11, 
x11, #0x1 +; CHECK-GI-NEXT: and x14, xzr, #0x1 ; CHECK-GI-NEXT: umulh x9, xzr, x10 -; CHECK-GI-NEXT: adds x14, x14, x15 -; CHECK-GI-NEXT: and x15, xzr, #0x1 +; CHECK-GI-NEXT: add x11, x12, x11 +; CHECK-GI-NEXT: add x12, x5, x14 +; CHECK-GI-NEXT: adds x13, x13, x15 ; CHECK-GI-NEXT: umulh x17, x1, x8 -; CHECK-GI-NEXT: cset w4, hs -; CHECK-GI-NEXT: add x15, x12, x15 -; CHECK-GI-NEXT: adds x12, x14, x16 -; CHECK-GI-NEXT: and x4, x4, #0x1 -; CHECK-GI-NEXT: mul x18, x3, x10 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: adds x12, x12, x11 -; CHECK-GI-NEXT: add x11, x15, x4 ; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: cset w15, hs -; CHECK-GI-NEXT: mul x5, x2, x8 -; CHECK-GI-NEXT: add x11, x11, x14 -; CHECK-GI-NEXT: and x14, x15, #0x1 -; CHECK-GI-NEXT: add x17, x9, x17 -; CHECK-GI-NEXT: add x14, x11, x14 -; CHECK-GI-NEXT: mov w11, #100 // =0x64 -; CHECK-GI-NEXT: umulh x13, x0, xzr -; CHECK-GI-NEXT: umulh x16, x2, x10 -; CHECK-GI-NEXT: adds x18, x18, x5 -; CHECK-GI-NEXT: mul x15, x3, x8 -; CHECK-GI-NEXT: add x13, x17, x13 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x10, x3, x10 -; CHECK-GI-NEXT: add x13, x13, x14 -; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: cmn x18, x16 -; CHECK-GI-NEXT: sub x18, x2, x2 -; CHECK-GI-NEXT: umulh x16, x2, x8 +; CHECK-GI-NEXT: adds x13, x13, x16 +; CHECK-GI-NEXT: mul x4, x3, x10 +; CHECK-GI-NEXT: add x12, x12, x14 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: add x15, x15, x18 +; CHECK-GI-NEXT: adds x11, x13, x11 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: mul x15, x2, x8 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: add x14, x9, x17 +; CHECK-GI-NEXT: sub x17, x2, x2 +; CHECK-GI-NEXT: umulh x16, x2, x10 +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: add x13, x14, x18 +; CHECK-GI-NEXT: add x12, x13, x12 ; CHECK-GI-NEXT: and x18, xzr, #0x1 -; CHECK-GI-NEXT: add x14, x17, x14 +; CHECK-GI-NEXT: mul x5, x3, x8 +; CHECK-GI-NEXT: extr x11, x12, x11, #4 +; CHECK-GI-NEXT: adds x13, x4, x15 +; CHECK-GI-NEXT: umulh x14, x3, x10 +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: cmn x13, x16 +; CHECK-GI-NEXT: and x15, x15, #0x1 +; CHECK-GI-NEXT: umulh x13, x2, x8 +; CHECK-GI-NEXT: cset w16, hs +; CHECK-GI-NEXT: add x17, x5, x17 +; CHECK-GI-NEXT: and x16, x16, #0x1 ; CHECK-GI-NEXT: umulh x8, x3, x8 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: adds x14, x17, x14 ; CHECK-GI-NEXT: and x17, xzr, #0x1 -; CHECK-GI-NEXT: adds x10, x15, x10 -; CHECK-GI-NEXT: add x15, x17, x18 +; CHECK-GI-NEXT: add x16, x18, x17 ; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x18, x2, xzr +; CHECK-GI-NEXT: adds x13, x14, x13 +; CHECK-GI-NEXT: umulh x14, x2, xzr ; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: adds x10, x10, x16 -; CHECK-GI-NEXT: lsl x16, x13, #60 -; CHECK-GI-NEXT: add x15, x15, x17 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: adds x10, x10, x14 -; CHECK-GI-NEXT: and x14, x17, #0x1 +; CHECK-GI-NEXT: cset w18, hs +; CHECK-GI-NEXT: adds x13, x13, x15 +; CHECK-GI-NEXT: add x15, x16, x17 +; CHECK-GI-NEXT: and x16, x18, #0x1 ; CHECK-GI-NEXT: cset w17, hs ; CHECK-GI-NEXT: add x8, x9, x8 -; CHECK-GI-NEXT: add x14, x15, x14 -; CHECK-GI-NEXT: and x15, x17, #0x1 -; CHECK-GI-NEXT: orr x12, x16, x12, lsr #4 -; CHECK-GI-NEXT: add x9, x14, x15 -; CHECK-GI-NEXT: add x8, x8, x18 -; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: lsr x9, x13, #4 -; CHECK-GI-NEXT: umulh x14, x12, x11 
-; CHECK-GI-NEXT: lsl x13, x8, #60 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: and x16, x17, #0x1 +; CHECK-GI-NEXT: lsr x9, x12, #4 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: umulh x17, x11, x10 +; CHECK-GI-NEXT: add x8, x8, x14 +; CHECK-GI-NEXT: add x8, x8, x15 +; CHECK-GI-NEXT: mul x11, x11, x10 +; CHECK-GI-NEXT: extr x12, x8, x13, #4 ; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: mul x12, x12, x11 -; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4 -; CHECK-GI-NEXT: madd x9, x9, x11, x14 -; CHECK-GI-NEXT: umulh x13, x10, x11 -; CHECK-GI-NEXT: subs x0, x0, x12 -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: madd x9, x9, x10, x17 +; CHECK-GI-NEXT: umulh x13, x12, x10 +; CHECK-GI-NEXT: subs x0, x0, x11 +; CHECK-GI-NEXT: mul x12, x12, x10 ; CHECK-GI-NEXT: sbc x1, x1, x9 -; CHECK-GI-NEXT: madd x8, x8, x11, x13 -; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: madd x8, x8, x10, x13 +; CHECK-GI-NEXT: subs x2, x2, x12 ; CHECK-GI-NEXT: sbc x3, x3, x8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll index 221e2fd..09e1fca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll @@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1265,7 +1265,7 @@ define amdgpu_ps void 
@atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir index 292fa4b..4f160b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir @@ -25,6 +25,7 @@ body: | ; GFX6-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si]].sub0 ; GFX6-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} @@ -35,6 +36,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi]].sub0 ; GFX8-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -45,6 +47,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_]].sub0 ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -55,6 +58,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_]].sub0 ; GFX11-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} @@ -89,39 +93,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx10 
[[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -150,6 +158,7 @@ body: | ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si]].sub0_sub1 ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} @@ -160,6 +169,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi]].sub0_sub1 ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} @@ -170,6 +180,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_]].sub0_sub1 ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} @@ -180,6 +191,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_]].sub0_sub1 ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX12-NEXT: {{ $}} @@ -214,39 +226,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 
= COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; 
GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll index 6c4f504..33ce278 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1) ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true 
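The new PASS-CHECK lines in amdgpu-simplify-uniform-waterfall.ll record that the uniform-intrinsic combine now folds the ballot-based exit condition of a single-lane "waterfall" loop back into the plain i1 it was computed from, while the now-dead ballot and icmp are left in place for later cleanup. A sketch of the kind of input these tests feed the pass, reconstructed from the check lines (the kernel below is illustrative, not the verbatim test body):

declare i64 @llvm.amdgcn.ballot.i64(i1)

define protected amdgpu_kernel void @waterfall_sketch(ptr addrspace(1) %out) {
entry:
  br label %while
while:                                          ; single-trip waterfall loop
  %done = phi i1 [ false, %entry ], [ true, %if ]
  %not_done = xor i1 %done, true
  ; ballot of a uniform i1 is either the full exec mask or zero, so
  ; comparing it against zero is equivalent to negating %not_done
  %ballot = tail call i64 @llvm.amdgcn.ballot.i64(i1 %not_done)
  %is_done = icmp eq i64 %ballot, 0
  br i1 %is_done, label %exit, label %if
if:
  store i32 5, ptr addrspace(1) %out, align 4
  br label %while
exit:
  ret void
}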
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 @@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3 ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] @@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll index aa11574..a3e42e5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; @@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 
[[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll index 49607e3..83f0229 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ -92,8 +92,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 -; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -106,8 +105,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -123,9 +121,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 % ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -139,9 +135,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %data = call i64 asm "; def $0", "=a"() %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -154,14 +148,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll new file mode 100644 index 0000000..6c58a1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll @@ -0,0 +1,581 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s + +define amdgpu_ps void @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_i64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_i64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_float(<8 x i32> inreg %rsrc, float %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_float: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_float: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_float: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_float: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.image.atomic.swap.1d.f32.i32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: 
s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_sub_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_sub_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_sub_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; 
GFX10PLUS-GISE-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_and_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_and_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_and_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_or_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_or_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_or_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; 
GFX10PLUS-GISE-LABEL: atomic_xor_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_xor_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_xor_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_inc_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_inc_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_inc_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_dec_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_dec_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_dec_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 
%s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d_64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { +; GFX10PLUS-GISE-LABEL: atomic_add_2d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { +; GFX10PLUS-GISE-LABEL: atomic_add_3d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_3d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_3d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_3d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { +; GFX10PLUS-GISE-LABEL: atomic_add_cube: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_cube: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_cube: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, 
v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_cube: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_1darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2dmsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2dmsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2dmsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2dmsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 
%slice, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darraymsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darraymsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d_slc: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d_slc: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d_slc: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d_slc: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll index 3d1d6c8..0ba62e4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll @@ -41,15 +41,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -79,15 +77,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 
dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -126,15 +122,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -173,15 +167,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -192,15 +184,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: 
image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll index 7a876f6..3544017 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll @@ -76,6 +76,20 @@ entry: ret i32 %ret } +define noundef i32 @wave_reduce_min(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_min : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %x) + ret i32 %ret +} + +define noundef i32 @wave_reduce_umin(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_umin : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %x) + ret i32 %ret +} + define void @wave_active_countbits(i1 %expr) { entry: ; CHECK: Function wave_active_countbits : [[WAVE_FLAG]] diff --git a/llvm/test/CodeGen/DirectX/WaveActiveMin.ll b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll new file mode 100644 index 0000000..24fde48 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll @@ -0,0 +1,143 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s + +; Test that for scalar values, WaveActiveMin maps down to the DirectX op + +define noundef half @wave_active_min_half(half noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr, i8 2, i8 0){{$}} + %ret = call half @llvm.dx.wave.reduce.min.f16(half %expr) + ret half %ret +} + +define noundef float @wave_active_min_float(float noundef %expr) { +entry: +; CHECK: call float @dx.op.waveActiveOp.f32(i32 119, float %expr, i8 2, i8 0){{$}} + %ret = call float @llvm.dx.wave.reduce.min.f32(float %expr) + ret float %ret +} + +define noundef double @wave_active_min_double(double noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr, i8 2, i8 0){{$}} + %ret = call double @llvm.dx.wave.reduce.min.f64(double %expr) + ret double %ret +} + +define noundef i16 @wave_active_min_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 0){{$}} + %ret = call i16 @llvm.dx.wave.reduce.min.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_min_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 0){{$}} + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_min_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 0){{$}} + %ret = call i64 @llvm.dx.wave.reduce.min.i64(i64 %expr) + ret i64 %ret +} + +define noundef i16 @wave_active_umin_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 1){{$}} + %ret = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_umin_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 1){{$}} + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_umin_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 1){{$}} + %ret = call i64 @llvm.dx.wave.reduce.umin.i64(i64 %expr) + ret i64 %ret +} + +declare half @llvm.dx.wave.reduce.min.f16(half) +declare 
float @llvm.dx.wave.reduce.min.f32(float) +declare double @llvm.dx.wave.reduce.min.f64(double) + +declare i16 @llvm.dx.wave.reduce.min.i16(i16) +declare i32 @llvm.dx.wave.reduce.min.i32(i32) +declare i64 @llvm.dx.wave.reduce.min.i64(i64) + +declare i16 @llvm.dx.wave.reduce.umin.i16(i16) +declare i32 @llvm.dx.wave.reduce.umin.i32(i32) +declare i64 @llvm.dx.wave.reduce.umin.i64(i64) + +; Test that for vector values, WaveActiveMin scalarizes and maps down to the +; DirectX op + +define noundef <2 x half> @wave_active_min_v2half(<2 x half> noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i0, i8 2, i8 0){{$}} +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i1, i8 2, i8 0){{$}} + %ret = call <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half> %expr) + ret <2 x half> %ret +} + +define noundef <3 x i32> @wave_active_min_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 0){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 0){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 0){{$}} + %ret = call <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x double> @wave_active_min_v4f64(<4 x double> noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i0, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i1, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i2, i8 2, i8 0){{$}} +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i3, i8 2, i8 0){{$}} + %ret = call <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double> %expr) + ret <4 x double> %ret +} + +declare <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half>) +declare <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32>) +declare <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double>) + +define noundef <2 x i16> @wave_active_umin_v2i16(<2 x i16> noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i1, i8 2, i8 1){{$}} + %ret = call <2 x i16> @llvm.dx.wave.reduce.umin.v2i16(<2 x i16> %expr) + ret <2 x i16> %ret +} + +define noundef <3 x i32> @wave_active_umin_v3i32(<3 x i32> noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 1){{$}} +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 1){{$}} + %ret = call <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32> %expr) + ret <3 x i32> %ret +} + +define noundef <4 x i64> @wave_active_umin_v4i64(<4 x i64> noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i0, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i1, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i2, i8 2, i8 1){{$}} +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i3, i8 2, i8 1){{$}} + %ret = call <4 x i64> @llvm.dx.wave.reduce.umin.v4i64(<4 x i64> %expr) + ret <4 x i64> %ret +} + +declare <2 x i16> @llvm.dx.wave.reduce.umin.v2i16(<2 x i16>) +declare <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32>) +declare <4 x i64> @llvm.dx.wave.reduce.umin.v4i64(<4 x i64>) diff --git
a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll new file mode 100644 index 0000000..48ec98c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 +; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.minnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.minnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: 
xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 +; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll new file mode 100644 index 0000000..27ecb75 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w 
$vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.minnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret void +} + +define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret void +} + +define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll new file mode 100644 index 0000000..b43555c6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll @@ 
-0,0 +1,642 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IAB %s +; +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IAB %s + +define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a3, a1, .LBB0_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a3, a2, .LBB0_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, 
i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a1, a3, .LBB1_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a2, a3, .LBB1_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a3, a1, .LBB2_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; 
RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a3, a2, .LBB2_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a1, a3, .LBB3_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a2, a3, .LBB3_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 
+; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_max_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB4_2 +; RV32IB-COMMON-NEXT: .LBB4_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_7 +; RV32IB-COMMON-NEXT: .LBB4_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB4_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB4_5 +; RV32IB-COMMON-NEXT: .LBB4_4: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB4_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB4_1 +; RV32IB-COMMON-NEXT: .LBB4_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a3, a1, .LBB4_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB4_3: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: 
sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_min_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB5_2 +; RV32IB-COMMON-NEXT: .LBB5_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_7 +; RV32IB-COMMON-NEXT: .LBB5_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB5_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB5_5 +; RV32IB-COMMON-NEXT: .LBB5_4: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB5_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB5_1 +; RV32IB-COMMON-NEXT: .LBB5_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a1, a3, .LBB5_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; 
RV64IB-ZALRSC-NEXT: .LBB5_3: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umax_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB6_2 +; RV32IB-COMMON-NEXT: .LBB6_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_7 +; RV32IB-COMMON-NEXT: .LBB6_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB6_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB6_5 +; RV32IB-COMMON-NEXT: .LBB6_4: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB6_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB6_1 +; RV32IB-COMMON-NEXT: .LBB6_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a3, a1, .LBB6_3 +; 
RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB6_3: # in Loop: Header=BB6_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umin_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB7_2 +; RV32IB-COMMON-NEXT: .LBB7_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_7 +; RV32IB-COMMON-NEXT: .LBB7_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB7_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB7_5 +; RV32IB-COMMON-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB7_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB7_1 +; RV32IB-COMMON-NEXT: .LBB7_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl 
a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a1, a3, .LBB7_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB7_3: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i64 %b seq_cst + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 5e5f2b7..37e11db 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -81,6 +81,7 @@ ; CHECK-NEXT: optimized-nf7-segment-load-store - vlseg7eN.v and vsseg7eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-nf8-segment-load-store - vlseg8eN.v and vsseg8eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load. +; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs. ; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects. ; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN. ; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix. 
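A minimal sketch of the IR shape the atomic-rmw-minmax.ll tests above exercise (the function name @sketch_atomicrmw_max is illustrative, not from the patch): a seq_cst atomicrmw max on i32 lowers to a single amomax.w.aqrl when +a is available; with only +zalrsc it expands to an lr.w.aqrl/sc.w.rl retry loop, and +permissive-zalrsc additionally allows the Zbb max/min instructions to sit between the LR/SC pair in place of the branch-and-move compare sequence, matching the feature description added to features-info.ll above.

define i32 @sketch_atomicrmw_max(ptr %p, i32 %v) nounwind {
  ; Sequentially consistent read-modify-write: returns the old value at %p
  ; and stores the signed maximum of that value and %v.
  %old = atomicrmw max ptr %p, i32 %v seq_cst
  ret i32 %old
}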
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll new file mode 100644 index 0000000..d121c1a --- /dev/null +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll @@ -0,0 +1,57 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; Test lowering to the SPIR-V backend for various scalar and vector types + +; CHECK: OpCapability GroupNonUniformArithmetic + +; CHECK-DAG: %[[#f16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#v4_half:]] = OpTypeVector %[[#f16]] 4 +; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3 + +; CHECK-LABEL: Begin function test_float +; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]] +define float @test_float(float %fexpr) { +entry: +; CHECK: %[[#fret:]] = OpGroupNonUniformFMin %[[#f32]] %[[#scope]] Reduce %[[#fexpr]] + %0 = call float @llvm.spv.wave.reduce.min.f32(float %fexpr) + ret float %0 +} + +; CHECK-LABEL: Begin function test_int_signed +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int_signed(i32 %iexpr) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformSMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]] + %0 = call i32 @llvm.spv.wave.reduce.min.i32(i32 %iexpr) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_int_unsigned +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int_unsigned(i32 %iexpr) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformUMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]] + %0 = call i32 @llvm.spv.wave.reduce.umin.i32(i32 %iexpr) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_vhalf +; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_half]] +define <4 x half> @test_vhalf(<4 x half> %vbexpr) { +entry: +; CHECK: %[[#vhalfret:]] = OpGroupNonUniformFMin %[[#v4_half]] %[[#scope]] Reduce %[[#vbexpr]] + %0 = call <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half> %vbexpr) + ret <4 x half> %0 +} + +declare float @llvm.spv.wave.reduce.min.f32(float) +declare i32 @llvm.spv.wave.reduce.min.i32(i32) +declare <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half>) + +declare float @llvm.spv.wave.reduce.umin.f32(float) +declare i32 @llvm.spv.wave.reduce.umin.i32(i32) +declare <4 x half> @llvm.spv.wave.reduce.umin.v4half(<4 x half>) + diff --git a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s index 8bd9148..4542027 100644 --- a/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s +++ b/llvm/test/MC/AMDGPU/buffer-op-swz-operand.s @@ -2,7 +2,7 @@ // CHECK: .amdgcn_target "amdgcn-amd-amdhsa--gfx1100" buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc -// CHECK: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092 slc ; <MCInst #13135 BUFFER_LOAD_DWORDX4_OFFEN_gfx11 +// CHECK: buffer_load_b128 v[0:3], v0, s[0:3], 0 offen offset:4092 slc ; <MCInst #{{[0-9]+}} BUFFER_LOAD_DWORDX4_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:10104> // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:7754> @@ -11,7 +11,7 @@ buffer_load_dwordx4 v[0:3], v0, s[0:3], 0, offen offset:4092 slc // CHECK-NEXT: ; <MCOperand Imm:2> // CHECK-NEXT: ; <MCOperand Imm:0>> buffer_store_dword v0, v1, s[0:3], 0 offen slc -// CHECK: buffer_store_b32 v0, v1, s[0:3], 0 offen slc ; <MCInst #14553 BUFFER_STORE_DWORD_OFFEN_gfx11 +// CHECK: buffer_store_b32 v0, v1,
s[0:3], 0 offen slc ; <MCInst #{{[0-9]+}} BUFFER_STORE_DWORD_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:487> // CHECK-NEXT: ; <MCOperand Reg:7754> @@ -22,7 +22,7 @@ buffer_store_dword v0, v1, s[0:3], 0 offen slc ; tbuffer ops use autogenerated asm parsers tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc -// CHECK: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc ; <MCInst #34095 TBUFFER_LOAD_FORMAT_XYZW_OFFEN_gfx11 +// CHECK: tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen offset:4092 slc ; <MCInst #{{[0-9]+}} TBUFFER_LOAD_FORMAT_XYZW_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:10104> // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:7754> @@ -32,7 -32,7 @@ tbuffer_load_format_xyzw v[0:3], v0, s[0:3], 0 format:[BUF_FMT_32_32_SINT] offen // CHECK-NEXT: ; <MCOperand Imm:2> // CHECK-NEXT: ; <MCOperand Imm:0>> tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc -// CHECK: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc ; <MCInst #34264 TBUFFER_STORE_FORMAT_D16_X_OFFEN_gfx11 +// CHECK: tbuffer_store_d16_format_x v0, v1, s[0:3], 0 format:[BUF_FMT_10_10_10_2_SNORM] offen slc ; <MCInst #{{[0-9]+}} TBUFFER_STORE_FORMAT_D16_X_OFFEN_gfx11 // CHECK-NEXT: ; <MCOperand Reg:486> // CHECK-NEXT: ; <MCOperand Reg:487> // CHECK-NEXT: ; <MCOperand Reg:7754> diff --git a/llvm/test/MC/RISCV/xqcili-linker-relaxation.s b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s new file mode 100644 index 0000000..ace6779 --- /dev/null +++ b/llvm/test/MC/RISCV/xqcili-linker-relaxation.s @@ -0,0 +1,37 @@ +# RUN: llvm-mc --triple=riscv32 -mattr=+relax,+experimental-xqcili \ +# RUN: %s -filetype=obj -o - -riscv-add-build-attributes \ +# RUN: | llvm-objdump -dr -M no-aliases - \ +# RUN: | FileCheck %s + +## This tests that we correctly emit relocations for linker relaxation when +## emitting `QC.E.LI` and `QC.LI`.
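+##
+## As a reading aid, restating what the CHECK lines below already verify
+## (no new expectations are added): each relaxable `QC.E.LI`/`QC.LI` site
+## should carry a group of three relocations: R_RISCV_VENDOR against the
+## QUALCOMM vendor symbol, the vendor-specific relocation for the
+## instruction itself (R_RISCV_CUSTOM194 for `QC.E.LI`, R_RISCV_CUSTOM192
+## for `QC.LI`) against `sym`, and R_RISCV_RELAX marking the site as
+## eligible for linker relaxation.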
+ + .section .text.ex1, "ax", @progbits +# CHECK-LABEL: <.text.ex1>: + blez a1, .L1 +# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex1> +# CHECK-NEXT: R_RISCV_BRANCH .L1{{$}} + qc.e.li a0, sym +# CHECK-NEXT: qc.e.li a0, 0x0 +# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}} +# CHECK-NEXT: R_RISCV_CUSTOM194 sym{{$}} +# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}} +.L1: +# CHECK: <.L1>: + ret +# CHECK-NEXT: c.jr ra + + .section .text.ex2, "ax", @progbits +# CHECK-LABEL: <.text.ex2>: + blez a1, .L2 +# CHECK-NEXT: bge zero, a1, 0x0 <.text.ex2> +# CHECK-NEXT: R_RISCV_BRANCH .L2{{$}} + qc.li a0, %qc.abs20(sym) +# CHECK-NEXT: qc.li a0, 0x0 +# CHECK-NEXT: R_RISCV_VENDOR QUALCOMM{{$}} +# CHECK-NEXT: R_RISCV_CUSTOM192 sym{{$}} +# CHECK-NEXT: R_RISCV_RELAX *ABS*{{$}} +.L2: +# CHECK: <.L2>: + ret +# CHECK-NEXT: c.jr ra diff --git a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll index 964a257..fafa82c 100644 --- a/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll +++ b/llvm/test/Transforms/LoopVectorize/reduction-inloop.ll @@ -2800,6 +2800,88 @@ exit: ret i64 %r.0.lcssa } +define i32 @reduction_expression_ext_mulacc_livein(ptr %a, i16 %c) { +; CHECK-LABEL: define i32 @reduction_expression_ext_mulacc_livein( +; CHECK-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +; CHECK-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP5:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1 +; CHECK-NEXT: [[TMP1:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16> +; CHECK-NEXT: [[TMP2:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = zext <4 x i16> [[TMP2]] to <4 x i32> +; CHECK-NEXT: [[TMP4:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP3]]) +; CHECK-NEXT: [[TMP5]] = add i32 [[VEC_PHI]], [[TMP4]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK: [[FOR_EXIT]]: +; CHECK-NEXT: ret i32 [[TMP5]] +; +; CHECK-INTERLEAVED-LABEL: define i32 @reduction_expression_ext_mulacc_livein( +; CHECK-INTERLEAVED-SAME: ptr [[A:%.*]], i16 [[C:%.*]]) { +; CHECK-INTERLEAVED-NEXT: [[ENTRY:.*:]] +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_PH:.*]] +; CHECK-INTERLEAVED: [[VECTOR_PH]]: +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <4 x i16> poison, i16 [[C]], i64 0 +; CHECK-INTERLEAVED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <4 x i16> [[BROADCAST_SPLATINSERT]], <4 x i16> poison, <4 x i32> zeroinitializer +; CHECK-INTERLEAVED-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK-INTERLEAVED: [[VECTOR_BODY]]: +; CHECK-INTERLEAVED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI:%.*]] = 
phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP8:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[VEC_PHI1:%.*]] = phi i32 [ 0, %[[VECTOR_PH]] ], [ [[TMP11:%.*]], %[[VECTOR_BODY]] ] +; CHECK-INTERLEAVED-NEXT: [[TMP0:%.*]] = getelementptr i8, ptr [[A]], i64 [[INDEX]] +; CHECK-INTERLEAVED-NEXT: [[TMP1:%.*]] = getelementptr i8, ptr [[TMP0]], i32 4 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i8>, ptr [[TMP0]], align 1 +; CHECK-INTERLEAVED-NEXT: [[WIDE_LOAD2:%.*]] = load <4 x i8>, ptr [[TMP1]], align 1 +; CHECK-INTERLEAVED-NEXT: [[TMP2:%.*]] = zext <4 x i8> [[WIDE_LOAD]] to <4 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP3:%.*]] = zext <4 x i8> [[WIDE_LOAD2]] to <4 x i16> +; CHECK-INTERLEAVED-NEXT: [[TMP4:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP2]] +; CHECK-INTERLEAVED-NEXT: [[TMP5:%.*]] = mul <4 x i16> [[BROADCAST_SPLAT]], [[TMP3]] +; CHECK-INTERLEAVED-NEXT: [[TMP6:%.*]] = zext <4 x i16> [[TMP4]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP7:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP6]]) +; CHECK-INTERLEAVED-NEXT: [[TMP8]] = add i32 [[VEC_PHI]], [[TMP7]] +; CHECK-INTERLEAVED-NEXT: [[TMP9:%.*]] = zext <4 x i16> [[TMP5]] to <4 x i32> +; CHECK-INTERLEAVED-NEXT: [[TMP10:%.*]] = call i32 @llvm.vector.reduce.add.v4i32(<4 x i32> [[TMP9]]) +; CHECK-INTERLEAVED-NEXT: [[TMP11]] = add i32 [[VEC_PHI1]], [[TMP10]] +; CHECK-INTERLEAVED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 +; CHECK-INTERLEAVED-NEXT: [[TMP12:%.*]] = icmp eq i64 [[INDEX_NEXT]], 1024 +; CHECK-INTERLEAVED-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP32:![0-9]+]] +; CHECK-INTERLEAVED: [[MIDDLE_BLOCK]]: +; CHECK-INTERLEAVED-NEXT: [[BIN_RDX:%.*]] = add i32 [[TMP11]], [[TMP8]] +; CHECK-INTERLEAVED-NEXT: br label %[[FOR_EXIT:.*]] +; CHECK-INTERLEAVED: [[FOR_EXIT]]: +; CHECK-INTERLEAVED-NEXT: ret i32 [[BIN_RDX]] +; +entry: + br label %for.body + +for.body: ; preds = %for.body, %entry + %iv = phi i64 [ 0, %entry ], [ %iv.next, %for.body ] + %accum = phi i32 [ 0, %entry ], [ %add, %for.body ] + %gep.a = getelementptr i8, ptr %a, i64 %iv + %load.a = load i8, ptr %gep.a, align 1 + %ext.a = zext i8 %load.a to i16 + %mul = mul i16 %c, %ext.a + %mul.ext = zext i16 %mul to i32 + %add = add i32 %mul.ext, %accum + %iv.next = add i64 %iv, 1 + %exitcond.not = icmp eq i64 %iv.next, 1024 + br i1 %exitcond.not, label %for.exit, label %for.body + +for.exit: ; preds = %for.body + ret i32 %add +} + declare float @llvm.fmuladd.f32(float, float, float) !6 = distinct !{!6, !7, !8} diff --git a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll index 06b0448..291ada8 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-printing-reductions.ll @@ -800,3 +800,545 @@ exit: %r.0.lcssa = phi i64 [ %rdx.next, %loop ] ret i64 %r.0.lcssa } + +define i32 @print_mulacc_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_mulacc_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV 
ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i32), (ir<63> zext to i32)) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = zext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<63> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + 
reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i32 %red.next +} + +; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128 +define i32 @print_mulacc_not_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_mulacc_not_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul ir<%l.ext>, ir<128>) +; CHECK-NEXT: EMIT vp<%index.next> = 
add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i32 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: 
ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i32 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %red.next = add i32 %red, %mul +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i32 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 128 + %red.next = add i32 %red, %mul + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red.next.lcssa = phi i32 [ %red.next, %loop ] + ret i32 %red.next.lcssa +} + +define i64 @print_ext_mulacc_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_ext_mulacc_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (mul (ir<%l> zext to i64), (ir<63> zext to i64)) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; 
CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR %mul.ext = zext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST vp<%4> = zext ir<%l> to i64 +; CHECK-NEXT: WIDEN ir<%mul> = mul vp<%4>, ir<63> +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%6> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%6> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%6>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = zext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 63 +; CHECK-NEXT: IR 
%mul.ext = zext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + %l.ext = zext i8 %l to i32 + %mul = mul i32 %l.ext, 63 + %mul.ext = zext i32 %mul to i64 + %red.next = add i64 %red, %mul.ext + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.next +} + +; Constants >= 128 cannot be treated as sign-extended, so the expression shouldn't extend 128 +define i64 @print_ext_mulacc_not_extended_const(ptr %start, ptr %end) { +; CHECK-LABEL: 'print_ext_mulacc_not_extended_const' +; CHECK: VPlan 'Initial VPlan for VF={4},UF>=1' { +; CHECK-NEXT: Live-in vp<%0> = VF +; CHECK-NEXT: Live-in vp<%1> = VF * UF +; CHECK-NEXT: Live-in vp<%2> = vector-trip-count +; CHECK-NEXT: vp<%3> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: EMIT vp<%3> = EXPAND SCEV (1 + (-1 * (ptrtoint ptr %start to i64)) + (ptrtoint ptr %end to i64)) +; CHECK-NEXT: Successor(s): scalar.ph, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: vp<%4> = DERIVED-IV ir<%start> + vp<%2> * ir<1> +; CHECK-NEXT: EMIT vp<%5> = reduction-start-vector ir<0>, ir<0>, ir<1> +; CHECK-NEXT: Successor(s): vector loop +; CHECK-EMPTY: +; CHECK-NEXT: <x1> vector loop: { +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT vp<%6> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi vp<%5>, vp<%9> +; CHECK-NEXT: vp<%7> = SCALAR-STEPS vp<%6>, ir<1>, vp<%0> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%7> +; CHECK-NEXT: vp<%8> = vector-pointer vp<%next.gep> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%8> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: EXPRESSION vp<%9> = ir<%red> + reduce.add (ir<%mul> sext to i64) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%6>, vp<%1> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%2> +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK-NEXT: Successor(s): middle.block +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%11> = compute-reduction-result ir<%red>, vp<%9> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq vp<%3>, vp<%2> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, scalar.ph +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%11> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: scalar.ph: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%4>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%11>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from scalar.ph) +; CHECK-NEXT: IR %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from scalar.ph) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 
%l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +; CHECK: VPlan 'Final VPlan for VF={4},UF={1}' { +; CHECK-NEXT: Live-in ir<%1> = original trip-count +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<entry>: +; CHECK-NEXT: IR %start2 = ptrtoint ptr %start to i64 +; CHECK-NEXT: IR %end1 = ptrtoint ptr %end to i64 +; CHECK-NEXT: IR %0 = add i64 %end1, 1 +; CHECK-NEXT: IR %1 = sub i64 %0, %start2 +; CHECK-NEXT: EMIT vp<%min.iters.check> = icmp ult ir<%1>, ir<4> +; CHECK-NEXT: EMIT branch-on-cond vp<%min.iters.check> +; CHECK-NEXT: Successor(s): ir-bb<scalar.ph>, vector.ph +; CHECK-EMPTY: +; CHECK-NEXT: vector.ph: +; CHECK-NEXT: EMIT vp<%n.mod.vf> = urem ir<%1>, ir<4> +; CHECK-NEXT: EMIT vp<%n.vec> = sub ir<%1>, vp<%n.mod.vf> +; CHECK-NEXT: vp<%3> = DERIVED-IV ir<%start> + vp<%n.vec> * ir<1> +; CHECK-NEXT: Successor(s): vector.body +; CHECK-EMPTY: +; CHECK-NEXT: vector.body: +; CHECK-NEXT: EMIT-SCALAR vp<%index> = phi [ ir<0>, vector.ph ], [ vp<%index.next>, vector.body ] +; CHECK-NEXT: WIDEN-REDUCTION-PHI ir<%red> = phi ir<0>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%next.gep> = ptradd ir<%start>, vp<%index> +; CHECK-NEXT: WIDEN ir<%l> = load vp<%next.gep> +; CHECK-NEXT: WIDEN-CAST ir<%l.ext> = sext ir<%l> to i32 +; CHECK-NEXT: WIDEN ir<%mul> = mul ir<%l.ext>, ir<128> +; CHECK-NEXT: WIDEN-CAST ir<%mul.ext> = sext ir<%mul> to i64 +; CHECK-NEXT: REDUCE ir<%red.next> = ir<%red> + reduce.add (ir<%mul.ext>) +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%index>, ir<4> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%n.vec> +; CHECK-NEXT: Successor(s): middle.block, vector.body +; CHECK-EMPTY: +; CHECK-NEXT: middle.block: +; CHECK-NEXT: EMIT vp<%5> = compute-reduction-result ir<%red>, ir<%red.next> +; CHECK-NEXT: EMIT vp<%cmp.n> = icmp eq ir<%1>, vp<%n.vec> +; CHECK-NEXT: EMIT branch-on-cond vp<%cmp.n> +; CHECK-NEXT: Successor(s): ir-bb<exit>, ir-bb<scalar.ph> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<exit>: +; CHECK-NEXT: IR %red.next.lcssa = phi i64 [ %red.next, %loop ] (extra operand: vp<%5> from middle.block) +; CHECK-NEXT: No successors +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<scalar.ph>: +; CHECK-NEXT: EMIT-SCALAR vp<%bc.resume.val> = phi [ vp<%3>, middle.block ], [ ir<%start>, ir-bb<entry> ] +; CHECK-NEXT: EMIT-SCALAR vp<%bc.merge.rdx> = phi [ vp<%5>, middle.block ], [ ir<0>, ir-bb<entry> ] +; CHECK-NEXT: Successor(s): ir-bb<loop> +; CHECK-EMPTY: +; CHECK-NEXT: ir-bb<loop>: +; CHECK-NEXT: IR %ptr.iv = phi ptr [ %start, %scalar.ph ], [ %gep.iv.next, %loop ] (extra operand: vp<%bc.resume.val> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %red = phi i64 [ 0, %scalar.ph ], [ %red.next, %loop ] (extra operand: vp<%bc.merge.rdx> from ir-bb<scalar.ph>) +; CHECK-NEXT: IR %l = load i8, ptr %ptr.iv, align 1 +; CHECK-NEXT: IR %l.ext = sext i8 %l to i32 +; CHECK-NEXT: IR %mul = mul i32 %l.ext, 128 +; CHECK-NEXT: IR %mul.ext = sext i32 %mul to i64 +; CHECK-NEXT: IR %red.next = add i64 %red, %mul.ext +; CHECK-NEXT: IR %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 +; CHECK-NEXT: IR %ec = icmp eq ptr %ptr.iv, %end +; CHECK-NEXT: No successors +; CHECK-NEXT: } +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %gep.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %l = load i8, ptr %ptr.iv, align 1 + 
%l.ext = sext i8 %l to i32 + %mul = mul i32 %l.ext, 128 + %mul.ext = sext i32 %mul to i64 + %red.next = add i64 %red, %mul.ext + %gep.iv.next = getelementptr i8, ptr %ptr.iv, i64 1 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + %red.next.lcssa = phi i64 [ %red.next, %loop ] + ret i64 %red.next.lcssa +} diff --git a/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll new file mode 100644 index 0000000..9e96e93 --- /dev/null +++ b/llvm/test/Transforms/SLPVectorizer/X86/parent-node-schedulable-with-multi-copyables.ll @@ -0,0 +1,170 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -passes=slp-vectorizer -S -slp-threshold=-100 -mtriple=x86_64-unknown-linux-gnu < %s | FileCheck %s + +define i64 @test(ptr %arg1, i64 %alloca.promoted344, i8 %load.311.i, i1 %load1.i) { +; CHECK-LABEL: define i64 @test( +; CHECK-SAME: ptr [[ARG1:%.*]], i64 [[ALLOCA_PROMOTED344:%.*]], i8 [[LOAD_311_I:%.*]], i1 [[LOAD1_I:%.*]]) { +; CHECK-NEXT: [[BB:.*]]: +; CHECK-NEXT: [[TMP0:%.*]] = insertelement <4 x i8> <i8 0, i8 0, i8 0, i8 poison>, i8 [[LOAD_311_I]], i32 3 +; CHECK-NEXT: [[TMP1:%.*]] = insertelement <4 x i8> <i8 poison, i8 poison, i8 0, i8 0>, i8 [[LOAD_311_I]], i32 0 +; CHECK-NEXT: [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[ALLOCA_PROMOTED344]], i32 0 +; CHECK-NEXT: br label %[[BB2:.*]] +; CHECK: [[BB2]]: +; CHECK-NEXT: [[TMP3:%.*]] = phi <2 x i64> [ zeroinitializer, %[[BB]] ], [ [[TMP28:%.*]], %[[BB12_8_I:.*]] ] +; CHECK-NEXT: [[TMP4:%.*]] = phi <8 x i8> [ zeroinitializer, %[[BB]] ], [ [[TMP29:%.*]], %[[BB12_8_I]] ] +; CHECK-NEXT: br i1 [[LOAD1_I]], label %[[SPAM_EXIT:.*]], label %[[BB4_LR_PH_I:.*]] +; CHECK: [[BB4_LR_PH_I]]: +; CHECK-NEXT: br i1 true, label %[[BB3_I_I_PEEL:.*]], label %[[EGGS_EXIT_I_PEEL:.*]] +; CHECK: [[BB3_I_I_PEEL]]: +; CHECK-NEXT: [[TMP5:%.*]] = and <2 x i64> [[TMP3]], splat (i64 1) +; CHECK-NEXT: [[LOAD4_I_I_PEEL:%.*]] = load i64, ptr [[ARG1]], align 8 +; CHECK-NEXT: [[SHL_I_I_PEEL:%.*]] = shl i64 [[LOAD4_I_I_PEEL]], 1 +; CHECK-NEXT: [[TMP6:%.*]] = shufflevector <2 x i64> [[TMP3]], <2 x i64> poison, <2 x i32> <i32 poison, i32 0> +; CHECK-NEXT: [[TMP7:%.*]] = insertelement <2 x i64> [[TMP6]], i64 [[SHL_I_I_PEEL]], i32 0 +; CHECK-NEXT: [[TMP8:%.*]] = or <2 x i64> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = xor <2 x i64> [[TMP5]], [[TMP7]] +; CHECK-NEXT: [[TMP10:%.*]] = shufflevector <2 x i64> [[TMP8]], <2 x i64> [[TMP9]], <2 x i32> <i32 0, i32 3> +; CHECK-NEXT: br label %[[EGGS_EXIT_I_PEEL]] +; CHECK: [[EGGS_EXIT_I_PEEL]]: +; CHECK-NEXT: [[TMP11:%.*]] = phi <2 x i64> [ [[TMP10]], %[[BB3_I_I_PEEL]] ], [ zeroinitializer, %[[BB4_LR_PH_I]] ] +; CHECK-NEXT: [[TMP12:%.*]] = shufflevector <2 x i64> [[TMP11]], <2 x i64> poison, <4 x i32> <i32 0, i32 1, i32 0, i32 0> +; CHECK-NEXT: [[TMP13:%.*]] = trunc <4 x i64> [[TMP12]] to <4 x i8> +; CHECK-NEXT: [[TMP14:%.*]] = extractelement <4 x i64> [[TMP12]], i32 1 +; CHECK-NEXT: br label %[[SPAM_EXIT]] +; CHECK: [[SPAM_EXIT]]: +; CHECK-NEXT: [[GETELEMENTPTR_I_I_PROMOTED346:%.*]] = phi i64 [ [[TMP14]], %[[EGGS_EXIT_I_PEEL]] ], [ 0, %[[BB2]] ] +; CHECK-NEXT: [[LOAD_8_I:%.*]] = phi i8 [ 0, %[[EGGS_EXIT_I_PEEL]] ], [ 1, %[[BB2]] ] +; CHECK-NEXT: [[TMP15:%.*]] = phi <4 x i8> [ [[TMP13]], %[[EGGS_EXIT_I_PEEL]] ], [ zeroinitializer, %[[BB2]] ] +; CHECK-NEXT: [[TMP16:%.*]] = shufflevector <4 x i8> [[TMP15]], <4 x i8> poison, <4 x 
i32> <i32 2, i32 2, i32 2, i32 2> +; CHECK-NEXT: br i1 [[LOAD1_I]], label %[[BB12_8_I]], label %[[BB12_1_THREAD_I:.*]] +; CHECK: [[BB12_1_THREAD_I]]: +; CHECK-NEXT: [[TMP17:%.*]] = extractelement <8 x i8> [[TMP4]], i32 0 +; CHECK-NEXT: [[ICMP5_3_I:%.*]] = icmp eq i8 [[TMP17]], 0 +; CHECK-NEXT: br i1 [[ICMP5_3_I]], label %[[BB12_3_I:.*]], label %[[BB8_3_I:.*]] +; CHECK: [[BB8_3_I]]: +; CHECK-NEXT: br label %[[BB12_3_I]] +; CHECK: [[BB12_3_I]]: +; CHECK-NEXT: [[TMP18:%.*]] = extractelement <8 x i8> [[TMP4]], i32 1 +; CHECK-NEXT: [[ICMP5_4_I:%.*]] = icmp eq i8 [[TMP18]], 0 +; CHECK-NEXT: br i1 [[ICMP5_4_I]], label %[[BB12_4_I:.*]], label %[[BB8_4_I:.*]] +; CHECK: [[BB8_4_I]]: +; CHECK-NEXT: br label %[[BB12_4_I]] +; CHECK: [[BB12_4_I]]: +; CHECK-NEXT: [[TMP19:%.*]] = extractelement <8 x i8> [[TMP4]], i32 2 +; CHECK-NEXT: [[ICMP5_5_I:%.*]] = icmp eq i8 [[TMP19]], 0 +; CHECK-NEXT: br i1 [[ICMP5_5_I]], label %[[BB12_5_I:.*]], label %[[BB8_5_I:.*]] +; CHECK: [[BB8_5_I]]: +; CHECK-NEXT: br label %[[BB12_5_I]] +; CHECK: [[BB12_5_I]]: +; CHECK-NEXT: [[TMP20:%.*]] = extractelement <8 x i8> [[TMP4]], i32 3 +; CHECK-NEXT: [[ICMP5_7_I:%.*]] = icmp eq i8 [[TMP20]], 0 +; CHECK-NEXT: br i1 [[ICMP5_7_I]], label %[[BB12_7_I:.*]], label %[[BB8_7_I:.*]] +; CHECK: [[BB8_7_I]]: +; CHECK-NEXT: br label %[[BB12_7_I]] +; CHECK: [[BB12_7_I]]: +; CHECK-NEXT: [[TMP21:%.*]] = extractelement <8 x i8> [[TMP4]], i32 4 +; CHECK-NEXT: [[ICMP5_8_I:%.*]] = icmp eq i8 [[TMP21]], 0 +; CHECK-NEXT: br i1 [[ICMP5_8_I]], label %[[BB12_8_I]], label %[[BB8_8_I:.*]] +; CHECK: [[BB8_8_I]]: +; CHECK-NEXT: [[TMP22:%.*]] = insertelement <4 x i8> [[TMP1]], i8 [[LOAD_8_I]], i32 1 +; CHECK-NEXT: [[TMP23:%.*]] = insertelement <4 x i8> poison, i8 [[LOAD_8_I]], i32 0 +; CHECK-NEXT: [[TMP24:%.*]] = shufflevector <8 x i8> [[TMP4]], <8 x i8> poison, <4 x i32> <i32 poison, i32 5, i32 6, i32 7> +; CHECK-NEXT: [[TMP25:%.*]] = shufflevector <4 x i8> [[TMP23]], <4 x i8> [[TMP24]], <4 x i32> <i32 0, i32 5, i32 6, i32 7> +; CHECK-NEXT: br label %[[BB12_8_I]] +; CHECK: [[BB12_8_I]]: +; CHECK-NEXT: [[TMP26:%.*]] = phi <4 x i8> [ [[TMP0]], %[[BB12_7_I]] ], [ [[TMP22]], %[[BB8_8_I]] ], [ [[TMP15]], %[[SPAM_EXIT]] ] +; CHECK-NEXT: [[TMP27:%.*]] = phi <4 x i8> [ zeroinitializer, %[[BB12_7_I]] ], [ [[TMP25]], %[[BB8_8_I]] ], [ [[TMP16]], %[[SPAM_EXIT]] ] +; CHECK-NEXT: [[TMP28]] = insertelement <2 x i64> [[TMP2]], i64 [[GETELEMENTPTR_I_I_PROMOTED346]], i32 1 +; CHECK-NEXT: [[TMP29]] = shufflevector <4 x i8> [[TMP26]], <4 x i8> [[TMP27]], <8 x i32> <i32 2, i32 7, i32 5, i32 0, i32 1, i32 3, i32 4, i32 6> +; CHECK-NEXT: br label %[[BB2]] +; +bb: + br label %bb2 + +bb2: + %getelementptr.i.i.promoted = phi i64 [ 0, %bb ], [ %getelementptr.i.i.promoted346, %bb12.8.i ] + %alloca.promoted = phi i64 [ 0, %bb ], [ %alloca.promoted344, %bb12.8.i ] + %load.8.i231 = phi i8 [ 0, %bb ], [ %load.8.i239, %bb12.8.i ] + %load.7.i217 = phi i8 [ 0, %bb ], [ %load.7.i225, %bb12.8.i ] + %load.626.i200 = phi i8 [ 0, %bb ], [ %load.626.i208, %bb12.8.i ] + %load.6.i183 = phi i8 [ 0, %bb ], [ %load.6.i191, %bb12.8.i ] + %load.5.i167 = phi i8 [ 0, %bb ], [ %load.5.i175, %bb12.8.i ] + %load.418.i148 = phi i8 [ 0, %bb ], [ %load.418.i156, %bb12.8.i ] + %load.4.i129 = phi i8 [ 0, %bb ], [ %load.4.i137, %bb12.8.i ] + %load.3.i111 = phi i8 [ 0, %bb ], [ %load.3.i119, %bb12.8.i ] + br i1 %load1.i, label %spam.exit, label %bb4.lr.ph.i + +bb4.lr.ph.i: + br i1 true, label %bb3.i.i.peel, label %eggs.exit.i.peel + +bb3.i.i.peel: + %and.i.i.peel = and i64 %alloca.promoted, 1 + %load4.i.i.peel = load 
i64, ptr %arg1, align 8 + %shl.i.i.peel = shl i64 %load4.i.i.peel, 1 + %or.i.i.peel = or i64 %shl.i.i.peel, %and.i.i.peel + %and6.i.i.peel = and i64 %getelementptr.i.i.promoted, 1 + %xor.i.i.peel = xor i64 %and6.i.i.peel, %alloca.promoted + br label %eggs.exit.i.peel + +eggs.exit.i.peel: + %load5.i.i93.peel = phi i64 [ %xor.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ] + %or.i.i91.peel = phi i64 [ %or.i.i.peel, %bb3.i.i.peel ], [ 0, %bb4.lr.ph.i ] + %0 = trunc i64 %or.i.i91.peel to i8 + %1 = trunc nuw i64 %or.i.i91.peel to i8 + %2 = trunc i64 %load5.i.i93.peel to i8 + br label %spam.exit + +spam.exit: + %getelementptr.i.i.promoted346 = phi i64 [ %load5.i.i93.peel, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.834.i = phi i8 [ %2, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.7.i25 = phi i8 [ %1, %eggs.exit.i.peel ], [ 0, %bb2 ] + %load.8.i = phi i8 [ 0, %eggs.exit.i.peel ], [ 1, %bb2 ] + %load.6.i18 = phi i8 [ %0, %eggs.exit.i.peel ], [ 0, %bb2 ] + br i1 %load1.i, label %bb12.8.i, label %bb12.1.thread.i + +bb12.1.thread.i: + %icmp5.3.i = icmp eq i8 %load.3.i111, 0 + br i1 %icmp5.3.i, label %bb12.3.i, label %bb8.3.i + +bb8.3.i: + br label %bb12.3.i + +bb12.3.i: + %icmp5.4.i = icmp eq i8 %load.4.i129, 0 + br i1 %icmp5.4.i, label %bb12.4.i, label %bb8.4.i + +bb8.4.i: + br label %bb12.4.i + +bb12.4.i: + %icmp5.5.i = icmp eq i8 %load.5.i167, 0 + br i1 %icmp5.5.i, label %bb12.5.i, label %bb8.5.i + +bb8.5.i: + br label %bb12.5.i + +bb12.5.i: + %icmp5.7.i = icmp eq i8 %load.7.i217, 0 + br i1 %icmp5.7.i, label %bb12.7.i, label %bb8.7.i + +bb8.7.i: + br label %bb12.7.i + +bb12.7.i: + %icmp5.8.i = icmp eq i8 %load.8.i231, 0 + br i1 %icmp5.8.i, label %bb12.8.i, label %bb8.8.i + +bb8.8.i: + br label %bb12.8.i + +bb12.8.i: + %load.8.i239 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.834.i, %spam.exit ] + %load.7.i225 = phi i8 [ 0, %bb12.7.i ], [ %load.311.i, %bb8.8.i ], [ %load.7.i25, %spam.exit ] + %load.626.i208 = phi i8 [ 0, %bb12.7.i ], [ %load.8.i, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.6.i191 = phi i8 [ %load.311.i, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.5.i175 = phi i8 [ 0, %bb12.7.i ], [ %load.6.i183, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.418.i156 = phi i8 [ 0, %bb12.7.i ], [ %load.626.i200, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.4.i137 = phi i8 [ 0, %bb12.7.i ], [ %load.418.i148, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + %load.3.i119 = phi i8 [ 0, %bb12.7.i ], [ 0, %bb8.8.i ], [ %load.6.i18, %spam.exit ] + br label %bb2 +} diff --git a/llvm/test/Transforms/SimplifyCFG/pr165088.ll b/llvm/test/Transforms/SimplifyCFG/pr165088.ll new file mode 100644 index 0000000..4514a19 --- /dev/null +++ b/llvm/test/Transforms/SimplifyCFG/pr165088.ll @@ -0,0 +1,186 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 6 +; RUN: opt -S -passes="simplifycfg<switch-range-to-icmp>" < %s | FileCheck %s + +; Avoid getting stuck in the cycle pr165088_cycle_[1-4]. 
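+;
+; A rough sketch of the cycle, inferred from the function names and shapes
+; below (illustrative, not an exact transcript of the regression): each
+; @pr165088_cycle_N function captures one intermediate state, with the
+; switch-range-to-icmp conversion and the later branch folds rewriting the
+; entry condition back and forth between switch and icmp forms, so the pass
+; never reached a fixpoint. The CHECK lines pin the expected stable output
+; for each state.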
+ +define void @pr165088_cycle_1(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_1( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %switch = icmp uge i8 %x, 2 + %cond1 = icmp ugt i8 %x, 1 + %or.cond = and i1 %switch, %cond1 + br i1 %or.cond, label %block3, label %block2 + +block1: + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: + br label %block3 + +block3: + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: + ret void +} + +define void @pr165088_cycle_2(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SWITCH:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[SWITCH]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + switch i8 %x, label %block3 [ + i8 1, label %block2 + i8 0, label %block2 + ] + +block1: ; preds = %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %entry, %block1 + br label %block3 + +block3: ; preds = %entry, %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_cycle_3(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_3( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br label %[[BLOCK3:.*]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK3]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + switch i8 %x, label %block1 [ + i8 1, label %block2 + i8 0, label %block2 + ] + +block1: ; preds = %entry, %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %entry, %block1 + br label %block3 + +block3: ; preds = %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_cycle_4(i8 %x) { +; CHECK-LABEL: define void @pr165088_cycle_4( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND2_OLD:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND2_OLD]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND3:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND3]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: 
[[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %switch = icmp ult i8 %x, 2 + br i1 %switch, label %block2, label %block1 + +block1: ; preds = %entry, %block3 + %cond2 = icmp ugt i8 %x, 1 + br i1 %cond2, label %block3, label %block2 + +block2: ; preds = %entry, %block1 + br label %block3 + +block3: ; preds = %block2, %block1 + %cond3 = icmp eq i8 %x, 0 + br i1 %cond3, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +} + +define void @pr165088_original(i8 %x) { +; CHECK-LABEL: define void @pr165088_original( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[TMP0:%.*]] = icmp ult i8 [[X]], 2 +; CHECK-NEXT: br i1 [[TMP0]], label %[[BLOCK2:.*]], label %[[BLOCK3:.*]] +; CHECK: [[BLOCK1:.*]]: +; CHECK-NEXT: [[COND3_OLD_OLD:%.*]] = icmp ugt i8 [[X]], 1 +; CHECK-NEXT: br i1 [[COND3_OLD_OLD]], label %[[BLOCK3]], label %[[BLOCK2]] +; CHECK: [[BLOCK2]]: +; CHECK-NEXT: br label %[[BLOCK3]] +; CHECK: [[BLOCK3]]: +; CHECK-NEXT: [[COND4:%.*]] = icmp eq i8 [[X]], 0 +; CHECK-NEXT: br i1 [[COND4]], label %[[EXIT:.*]], label %[[BLOCK1]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; +entry: + %cond = icmp ne i8 %x, 0 + %cond3 = icmp ne i8 %x, 0 + %or.cond = and i1 %cond, %cond3 + br i1 %or.cond, label %block3, label %block2 + +block1: ; preds = %block3 + %cond3.old = icmp ugt i8 %x, 1 + br i1 %cond3.old, label %block3, label %block2 + +block2: ; preds = %block1, %entry + br label %block3 + +block3: ; preds = %block2, %block1, %entry + %cond4 = icmp eq i8 %x, 0 + br i1 %cond4, label %exit, label %block1 + +exit: ; preds = %block3 + ret void +}
