Diffstat (limited to 'llvm/test/CodeGen')
19 files changed, 2469 insertions, 968 deletions
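Reading aid (not part of the diff): the updated GISEL CHECK lines below replace each lsr/lsl-plus-orr pair that stitched two 64-bit limbs together with a single EXTR, which extracts a 64-bit field from the concatenation of two registers. A minimal i128 sketch of the pattern follows; the function name is hypothetical, the x1:x0 (high:low) register assignment matches the i128 calling convention visible in the checks below, and the real i512 tests interleave loads and stores around the same extracts.

define i128 @lshr_i128_const_32(i128 %x) {
entry:
  %r = lshr i128 %x, 32
  ret i128 %r
}

; Expected lowering sketch, with %x in x1:x0 (high:low):
;   extr x0, x1, x0, #32   ; low limb: bits [95:32] of the 128-bit value
;   lsr  x1, x1, #32       ; high limb: zero-filled from above
;   ret

Each extr in the i512 tests plays the same role for one adjacent pair of limbs; only the topmost limb still needs a plain lsr (or, for the arithmetic-shift tests, an asr combined with an orr of the shifted sign word).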
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll index 41f7ab8..480fcbd 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll +++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll @@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_32: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #32 -; GISEL-NEXT: lsr x13, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x10, x9, lsl #32 -; GISEL-NEXT: lsr x10, x11, #32 -; GISEL-NEXT: orr x11, x13, x11, lsl #32 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #32 -; GISEL-NEXT: orr x10, x10, x12, lsl #32 -; GISEL-NEXT: lsr x12, x14, #32 -; GISEL-NEXT: lsr x9, x15, #32 -; GISEL-NEXT: orr x8, x8, x14, lsl #32 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #32 -; GISEL-NEXT: lsr x12, x13, #32 -; GISEL-NEXT: orr x9, x9, x13, lsl #32 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #32 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #32 +; GISEL-NEXT: extr x10, x15, x14, #32 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #32 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #32 -; GISEL-NEXT: lsl x8, x16, #32 -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: lsl x13, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: orr x11, x11, x14, lsr #32 -; GISEL-NEXT: orr x9, x13, x12, lsr #32 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: extr x8, x15, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #32 +; GISEL-NEXT: stp x8, 
x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_32: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x15, x9, #32 -; GISEL-NEXT: lsl x16, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #32 -; GISEL-NEXT: lsl x15, x13, #32 -; GISEL-NEXT: orr x9, x16, x9, lsr #32 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #32 -; GISEL-NEXT: orr x10, x15, x10, lsr #32 -; GISEL-NEXT: lsl x15, x12, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 -; GISEL-NEXT: lsl x11, x17, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #32 -; GISEL-NEXT: lsl x13, x16, #32 -; GISEL-NEXT: orr x10, x11, x12, lsr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #32 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #32 +; GISEL-NEXT: extr x9, x15, x14, #32 +; GISEL-NEXT: lsl x8, x8, #32 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #32 +; GISEL-NEXT: extr x11, x13, x12, #32 +; GISEL-NEXT: orr x8, x8, x13, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #32 -; GISEL-NEXT: lsr x16, x9, #32 -; GISEL-NEXT: lsl x8, x8, #32 -; GISEL-NEXT: orr x9, x14, x9, lsl #32 -; GISEL-NEXT: lsr x14, x10, #32 -; GISEL-NEXT: orr x10, x16, x10, lsl #32 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #32 -; GISEL-NEXT: orr x11, x14, x11, lsl #32 -; GISEL-NEXT: lsr x14, x12, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #32 -; GISEL-NEXT: orr x8, x8, x12, lsl #32 -; GISEL-NEXT: orr x10, x14, x13, lsl #32 -; GISEL-NEXT: orr x9, x9, x15, lsl #32 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #32 +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #32 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #32 -; 
GISEL-NEXT: lsl x13, x9, #32 -; GISEL-NEXT: orr x10, x12, x10, lsr #32 -; GISEL-NEXT: lsl x12, x11, #32 -; GISEL-NEXT: orr x8, x13, x8, lsr #32 -; GISEL-NEXT: lsl x13, x14, #32 -; GISEL-NEXT: orr x9, x12, x9, lsr #32 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #32 -; GISEL-NEXT: orr x11, x13, x11, lsr #32 -; GISEL-NEXT: lsl x12, x16, #32 -; GISEL-NEXT: orr x8, x10, x14, lsr #32 -; GISEL-NEXT: lsr x10, x16, #32 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #32 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #32 +; GISEL-NEXT: extr x9, x13, x12, #32 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #32 +; GISEL-NEXT: lsr x8, x14, #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_96: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #32 -; GISEL-NEXT: lsl x14, x9, #32 -; GISEL-NEXT: lsl x15, x10, #32 -; GISEL-NEXT: orr x11, x12, x11, lsr #32 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #32 -; GISEL-NEXT: lsl x14, x13, #32 -; GISEL-NEXT: orr x9, x15, x9, lsr #32 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #32 -; GISEL-NEXT: orr x10, x14, x10, lsr #32 -; GISEL-NEXT: lsl x14, x16, #32 -; GISEL-NEXT: orr x8, x11, x13, lsr #32 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #32 +; GISEL-NEXT: extr x9, x10, x9, #32 +; GISEL-NEXT: extr x10, x11, x10, #32 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #32 +; GISEL-NEXT: extr x9, x14, x13, #32 ; GISEL-NEXT: lsl x11, x15, #32 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #32 -; GISEL-NEXT: orr x10, x11, x16, asr #32 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #32 +; GISEL-NEXT: orr x8, x11, x12, asr #32 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_1: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #63 -; GISEL-NEXT: lsr x13, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x10, x9, lsl #1 -; GISEL-NEXT: lsr x10, x11, #63 -; GISEL-NEXT: orr x11, x13, x11, lsl #1 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #63 -; GISEL-NEXT: orr x10, x10, x12, lsl #1 -; GISEL-NEXT: lsr x12, x14, #63 -; GISEL-NEXT: lsr x9, x15, #63 -; 
GISEL-NEXT: orr x8, x8, x14, lsl #1 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #1 -; GISEL-NEXT: lsr x12, x13, #63 -; GISEL-NEXT: orr x9, x9, x13, lsl #1 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #1 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #63 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #63 +; GISEL-NEXT: extr x10, x15, x14, #63 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #63 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #1 -; GISEL-NEXT: lsl x8, x16, #63 -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: lsl x13, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: orr x11, x11, x14, lsr #1 -; GISEL-NEXT: orr x9, x13, x12, lsr #1 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: extr x8, x15, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #1 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_1: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x15, x9, #63 -; GISEL-NEXT: lsl x16, x10, #63 -; GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #1 -; GISEL-NEXT: lsl x15, x13, #63 -; GISEL-NEXT: orr x9, x16, x9, lsr #1 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #63 -; GISEL-NEXT: orr x10, x15, x10, lsr #1 -; GISEL-NEXT: lsl x15, x12, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 -; GISEL-NEXT: lsl x11, x17, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #1 -; GISEL-NEXT: lsl x13, x16, #63 -; 
GISEL-NEXT: orr x10, x11, x12, lsr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #1 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #1 +; GISEL-NEXT: extr x9, x15, x14, #1 +; GISEL-NEXT: lsl x8, x8, #63 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #1 +; GISEL-NEXT: extr x11, x13, x12, #1 +; GISEL-NEXT: orr x8, x8, x13, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_15: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #49 -; GISEL-NEXT: lsr x13, x9, #49 -; GISEL-NEXT: lsl x8, x8, #15 -; GISEL-NEXT: orr x9, x10, x9, lsl #15 -; GISEL-NEXT: lsr x10, x11, #49 -; GISEL-NEXT: orr x11, x13, x11, lsl #15 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #49 -; GISEL-NEXT: orr x10, x10, x12, lsl #15 -; GISEL-NEXT: lsr x12, x14, #49 -; GISEL-NEXT: lsr x9, x15, #49 -; GISEL-NEXT: orr x8, x8, x14, lsl #15 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #15 -; GISEL-NEXT: lsr x12, x13, #49 -; GISEL-NEXT: orr x9, x9, x13, lsl #15 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #15 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; GISEL-NEXT: lsl x12, x8, #15 +; GISEL-NEXT: extr x8, x9, x8, #49 +; GISEL-NEXT: extr x9, x10, x9, #49 +; GISEL-NEXT: extr x10, x11, x10, #49 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #49 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #49 +; GISEL-NEXT: extr x10, x15, x14, #49 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #49 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x13, x9, #49 -; GISEL-NEXT: lsl x15, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: orr x8, x13, x8, lsr #15 -; GISEL-NEXT: lsl x13, x14, #49 -; GISEL-NEXT: orr x9, x15, x9, lsr #15 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #15 -; GISEL-NEXT: lsl x8, x16, #49 -; GISEL-NEXT: lsl x11, x12, #49 -; GISEL-NEXT: lsl x13, x15, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #15 -; GISEL-NEXT: lsr x10, x16, #15 -; GISEL-NEXT: orr x11, x11, x14, lsr #15 -; GISEL-NEXT: orr x9, x13, x12, lsr #15 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, 
x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #15 +; GISEL-NEXT: extr x9, x13, x12, #15 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #15 +; GISEL-NEXT: extr x8, x15, x14, #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #15 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_15: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #49 -; GISEL-NEXT: lsl x15, x9, #49 -; GISEL-NEXT: lsl x16, x10, #49 -; GISEL-NEXT: orr x11, x12, x11, lsr #15 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x8, x15, x8, lsr #15 -; GISEL-NEXT: lsl x15, x13, #49 -; GISEL-NEXT: orr x9, x16, x9, lsr #15 -; GISEL-NEXT: asr x16, x17, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x14, #49 -; GISEL-NEXT: orr x10, x15, x10, lsr #15 -; GISEL-NEXT: lsl x15, x12, #49 -; GISEL-NEXT: orr x8, x11, x13, lsr #15 -; GISEL-NEXT: lsl x11, x17, #49 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x15, x14, lsr #15 -; GISEL-NEXT: lsl x13, x16, #49 -; GISEL-NEXT: orr x10, x11, x12, lsr #15 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: orr x8, x13, x17, asr #15 -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #15 +; GISEL-NEXT: ldp x14, x15, [x1, #32] +; GISEL-NEXT: extr x9, x10, x9, #15 +; GISEL-NEXT: extr x10, x11, x10, #15 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: asr x8, x13, #63 +; GISEL-NEXT: extr x11, x14, x11, #15 +; GISEL-NEXT: extr x9, x15, x14, #15 +; GISEL-NEXT: lsl x8, x8, #49 +; GISEL-NEXT: stp x10, x11, [x0, #16] +; GISEL-NEXT: extr x10, x12, x15, #15 +; GISEL-NEXT: extr x11, x13, x12, #15 +; GISEL-NEXT: orr x8, x8, x13, asr #15 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x11, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) { ; GISEL-LABEL: test_shl_i512_const_63: ; GISEL: ; %bb.0: ; %entry ; GISEL-NEXT: ldp x8, x9, [x1] -; GISEL-NEXT: ldp x11, x12, [x1, #16] -; GISEL-NEXT: ldp x14, x15, [x1, #32] -; GISEL-NEXT: lsr x10, x8, #1 -; GISEL-NEXT: lsr x13, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x10, x9, lsl #63 -; GISEL-NEXT: lsr x10, x11, #1 -; GISEL-NEXT: orr x11, x13, x11, lsl #63 -; GISEL-NEXT: ldp x13, x16, [x1, #48] -; GISEL-NEXT: stp x8, x9, [x0] -; GISEL-NEXT: lsr x8, x12, #1 -; GISEL-NEXT: orr x10, x10, x12, lsl #63 -; GISEL-NEXT: lsr x12, x14, #1 -; GISEL-NEXT: lsr x9, x15, #1 -; GISEL-NEXT: orr x8, x8, x14, lsl #63 -; GISEL-NEXT: stp x11, x10, [x0, #16] -; GISEL-NEXT: orr x11, x12, x15, lsl #63 -; GISEL-NEXT: lsr x12, x13, #1 -; GISEL-NEXT: orr x9, x9, x13, lsl #63 -; GISEL-NEXT: stp x8, x11, [x0, #32] -; GISEL-NEXT: orr x8, x12, x16, lsl #63 -; GISEL-NEXT: stp x9, x8, [x0, #48] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x13, x14, [x1, #32] +; 
GISEL-NEXT: lsl x12, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: ldp x15, x16, [x1, #48] +; GISEL-NEXT: stp x12, x8, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: stp x9, x10, [x0, #16] +; GISEL-NEXT: extr x9, x14, x13, #1 +; GISEL-NEXT: extr x10, x15, x14, #1 +; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: extr x8, x16, x15, #1 +; GISEL-NEXT: stp x10, x8, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x11, [x1] -; GISEL-NEXT: ldp x10, x14, [x1, #24] -; GISEL-NEXT: ldr x16, [x1, #56] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: lsl x15, x10, #1 -; GISEL-NEXT: orr x11, x12, x11, lsr #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x15, x9, lsr #63 -; GISEL-NEXT: ldp x12, x15, [x1, #40] -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: orr x10, x13, x10, lsr #63 -; GISEL-NEXT: lsl x8, x16, #1 -; GISEL-NEXT: lsl x11, x12, #1 -; GISEL-NEXT: lsl x13, x15, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x8, x8, x15, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: orr x11, x11, x14, lsr #63 -; GISEL-NEXT: orr x9, x13, x12, lsr #63 -; GISEL-NEXT: stp x8, x10, [x0, #48] -; GISEL-NEXT: stp x11, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: lsr x9, x15, #63 +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_63: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #8] -; GISEL-NEXT: ldr x10, [x1] -; GISEL-NEXT: ldp x11, x13, [x1, #24] -; GISEL-NEXT: ldr x17, [x1, #56] -; GISEL-NEXT: lsl x15, x9, #1 -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x16, x11, #1 -; GISEL-NEXT: orr x8, x15, x8, lsr #63 -; GISEL-NEXT: lsl x15, x13, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: ldp x14, x12, [x1, #40] -; GISEL-NEXT: orr x9, x16, x9, lsr #63 -; GISEL-NEXT: orr x11, x15, x11, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x8, x17, #1 -; GISEL-NEXT: lsl x16, x14, #1 -; GISEL-NEXT: lsl x10, x12, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: asr x9, x17, #63 -; GISEL-NEXT: orr x8, x8, x12, lsr #63 -; GISEL-NEXT: orr x13, x16, x13, lsr #63 -; GISEL-NEXT: orr x10, x10, x14, lsr #63 -; GISEL-NEXT: orr x9, x9, x9, lsl #1 -; GISEL-NEXT: stp x13, x10, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1] +; GISEL-NEXT: ldp x10, x11, [x1, #16] +; GISEL-NEXT: ldp x12, x13, [x1, #32] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: ldp x14, x15, [x1, #48] +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; 
GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: extr x11, x14, x13, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: asr x10, x15, #63 +; GISEL-NEXT: extr x8, x15, x14, #63 +; GISEL-NEXT: stp x9, x11, [x0, #32] +; GISEL-NEXT: orr x9, x10, x10, lsl #1 ; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: @@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #63 -; GISEL-NEXT: lsr x16, x9, #63 -; GISEL-NEXT: lsl x8, x8, #1 -; GISEL-NEXT: orr x9, x14, x9, lsl #1 -; GISEL-NEXT: lsr x14, x10, #63 -; GISEL-NEXT: orr x10, x16, x10, lsl #1 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #63 -; GISEL-NEXT: orr x11, x14, x11, lsl #1 -; GISEL-NEXT: lsr x14, x12, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #63 -; GISEL-NEXT: orr x8, x8, x12, lsl #1 -; GISEL-NEXT: orr x10, x14, x13, lsl #1 -; GISEL-NEXT: orr x9, x9, x15, lsl #1 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #1 +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #63 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x13, x9, #63 -; GISEL-NEXT: orr x10, x12, x10, lsr #1 -; GISEL-NEXT: lsl x12, x11, #63 -; GISEL-NEXT: orr x8, x13, x8, lsr #1 -; GISEL-NEXT: lsl x13, x14, #63 -; GISEL-NEXT: orr x9, x12, x9, lsr #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #63 -; GISEL-NEXT: orr x11, x13, x11, lsr #1 -; GISEL-NEXT: lsl x12, x16, #63 -; GISEL-NEXT: orr x8, x10, x14, lsr #1 -; GISEL-NEXT: lsr x10, x16, #1 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #1 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #1 +; GISEL-NEXT: lsr x8, x14, #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_65: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #63 -; GISEL-NEXT: lsl x14, x9, #63 -; GISEL-NEXT: lsl x15, x10, #63 -; 
GISEL-NEXT: orr x11, x12, x11, lsr #1 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #1 -; GISEL-NEXT: lsl x14, x13, #63 -; GISEL-NEXT: orr x9, x15, x9, lsr #1 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #63 -; GISEL-NEXT: orr x10, x14, x10, lsr #1 -; GISEL-NEXT: lsl x14, x16, #63 -; GISEL-NEXT: orr x8, x11, x13, lsr #1 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #1 +; GISEL-NEXT: extr x9, x14, x13, #1 ; GISEL-NEXT: lsl x11, x15, #63 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #1 -; GISEL-NEXT: orr x10, x11, x16, asr #1 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #1 +; GISEL-NEXT: orr x8, x11, x12, asr #1 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #28 -; GISEL-NEXT: lsr x16, x9, #28 -; GISEL-NEXT: lsl x8, x8, #36 -; GISEL-NEXT: orr x9, x14, x9, lsl #36 -; GISEL-NEXT: lsr x14, x10, #28 -; GISEL-NEXT: orr x10, x16, x10, lsl #36 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #28 -; GISEL-NEXT: orr x11, x14, x11, lsl #36 -; GISEL-NEXT: lsr x14, x12, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #28 -; GISEL-NEXT: orr x8, x8, x12, lsl #36 -; GISEL-NEXT: orr x10, x14, x13, lsl #36 -; GISEL-NEXT: orr x9, x9, x15, lsl #36 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #36 +; GISEL-NEXT: extr x8, x9, x8, #28 +; GISEL-NEXT: extr x9, x10, x9, #28 +; GISEL-NEXT: extr x10, x11, x10, #28 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #28 +; GISEL-NEXT: extr x9, x13, x12, #28 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #28 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x13, x9, #28 -; GISEL-NEXT: orr x10, x12, x10, lsr #36 -; GISEL-NEXT: lsl x12, x11, #28 -; GISEL-NEXT: orr x8, x13, x8, lsr #36 -; GISEL-NEXT: lsl x13, x14, #28 -; GISEL-NEXT: orr x9, x12, x9, lsr #36 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #28 -; GISEL-NEXT: orr x11, x13, x11, lsr #36 -; GISEL-NEXT: lsl x12, x16, #28 -; GISEL-NEXT: orr x8, x10, x14, lsr #36 -; GISEL-NEXT: lsr x10, x16, #36 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #36 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; 
GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #36 +; GISEL-NEXT: extr x9, x13, x12, #36 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #36 +; GISEL-NEXT: lsr x8, x14, #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_100: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x11, [x1, #8] -; GISEL-NEXT: ldp x10, x13, [x1, #32] -; GISEL-NEXT: lsl x12, x8, #28 -; GISEL-NEXT: lsl x14, x9, #28 -; GISEL-NEXT: lsl x15, x10, #28 -; GISEL-NEXT: orr x11, x12, x11, lsr #36 -; GISEL-NEXT: ldp x12, x16, [x1, #48] -; GISEL-NEXT: orr x8, x14, x8, lsr #36 -; GISEL-NEXT: lsl x14, x13, #28 -; GISEL-NEXT: orr x9, x15, x9, lsr #36 -; GISEL-NEXT: asr x15, x16, #63 -; GISEL-NEXT: stp x11, x8, [x0] -; GISEL-NEXT: lsl x11, x12, #28 -; GISEL-NEXT: orr x10, x14, x10, lsr #36 -; GISEL-NEXT: lsl x14, x16, #28 -; GISEL-NEXT: orr x8, x11, x13, lsr #36 +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x13, [x1, #40] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x14, x12, [x1, #48] +; GISEL-NEXT: extr x8, x9, x8, #36 +; GISEL-NEXT: extr x9, x10, x9, #36 +; GISEL-NEXT: extr x10, x11, x10, #36 +; GISEL-NEXT: asr x15, x12, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x13, x11, #36 +; GISEL-NEXT: extr x9, x14, x13, #36 ; GISEL-NEXT: lsl x11, x15, #28 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: orr x9, x14, x12, lsr #36 -; GISEL-NEXT: orr x10, x11, x16, asr #36 -; GISEL-NEXT: stp x8, x9, [x0, #32] -; GISEL-NEXT: stp x10, x15, [x0, #48] +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x12, x14, #36 +; GISEL-NEXT: orr x8, x11, x12, asr #36 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, x15, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) { ; GISEL-NEXT: ldr x15, [x1, #48] ; GISEL-NEXT: ldp x10, x11, [x1, #16] ; GISEL-NEXT: ldp x12, x13, [x1, #32] -; GISEL-NEXT: lsr x14, x8, #1 -; GISEL-NEXT: lsr x16, x9, #1 -; GISEL-NEXT: lsl x8, x8, #63 -; GISEL-NEXT: orr x9, x14, x9, lsl #63 -; GISEL-NEXT: lsr x14, x10, #1 -; GISEL-NEXT: orr x10, x16, x10, lsl #63 -; GISEL-NEXT: stp xzr, x8, [x0] -; GISEL-NEXT: lsr x8, x11, #1 -; GISEL-NEXT: orr x11, x14, x11, lsl #63 -; GISEL-NEXT: lsr x14, x12, #1 -; GISEL-NEXT: stp x9, x10, [x0, #16] -; GISEL-NEXT: lsr x9, x13, #1 -; GISEL-NEXT: orr x8, x8, x12, lsl #63 -; GISEL-NEXT: orr x10, x14, x13, lsl #63 -; GISEL-NEXT: orr x9, x9, x15, lsl #63 -; GISEL-NEXT: stp x11, x8, [x0, #32] -; GISEL-NEXT: stp x10, x9, [x0, #48] +; GISEL-NEXT: lsl x14, x8, #63 +; GISEL-NEXT: extr x8, x9, x8, #1 +; GISEL-NEXT: extr x9, x10, x9, #1 +; GISEL-NEXT: extr x10, x11, x10, #1 +; GISEL-NEXT: stp xzr, x14, [x0] +; GISEL-NEXT: stp x8, x9, [x0, #16] +; GISEL-NEXT: extr x8, x12, x11, #1 +; GISEL-NEXT: extr x9, x13, x12, #1 +; GISEL-NEXT: stp x10, x8, [x0, #32] +; GISEL-NEXT: extr x10, x15, x13, #1 +; GISEL-NEXT: stp x9, x10, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6266,27 +6114,21 @@ 
define void @test_lshr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_lshr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x15, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: lsl x12, x16, #1 -; GISEL-NEXT: orr x8, x10, x14, lsr #63 -; GISEL-NEXT: lsr x10, x16, #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x12, x15, lsr #63 -; GISEL-NEXT: stp x10, xzr, [x0, #48] -; GISEL-NEXT: stp x8, x9, [x0, #32] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: extr x9, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: lsr x8, x14, #63 +; GISEL-NEXT: stp x9, x10, [x0, #32] +; GISEL-NEXT: stp x8, xzr, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 @@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) { ; ; GISEL-LABEL: test_ashr_i512_const_127: ; GISEL: ; %bb.0: ; %entry -; GISEL-NEXT: ldp x8, x9, [x1, #16] -; GISEL-NEXT: ldr x10, [x1, #8] -; GISEL-NEXT: ldp x11, x14, [x1, #32] -; GISEL-NEXT: ldp x15, x16, [x1, #48] -; GISEL-NEXT: lsl x12, x8, #1 -; GISEL-NEXT: lsl x13, x9, #1 -; GISEL-NEXT: orr x10, x12, x10, lsr #63 -; GISEL-NEXT: lsl x12, x11, #1 -; GISEL-NEXT: orr x8, x13, x8, lsr #63 -; GISEL-NEXT: lsl x13, x14, #1 -; GISEL-NEXT: orr x9, x12, x9, lsr #63 -; GISEL-NEXT: lsl x12, x15, #1 -; GISEL-NEXT: stp x10, x8, [x0] -; GISEL-NEXT: lsl x10, x16, #1 -; GISEL-NEXT: orr x11, x13, x11, lsr #63 -; GISEL-NEXT: asr x8, x16, #63 -; GISEL-NEXT: orr x12, x12, x14, lsr #63 -; GISEL-NEXT: stp x9, x11, [x0, #16] -; GISEL-NEXT: orr x9, x10, x15, lsr #63 -; GISEL-NEXT: orr x10, x8, x8, lsl #1 -; GISEL-NEXT: stp x12, x9, [x0, #32] -; GISEL-NEXT: stp x10, x8, [x0, #48] +; GISEL-NEXT: ldp x8, x9, [x1, #8] +; GISEL-NEXT: ldr x14, [x1, #56] +; GISEL-NEXT: ldp x10, x11, [x1, #24] +; GISEL-NEXT: ldp x12, x13, [x1, #40] +; GISEL-NEXT: extr x8, x9, x8, #63 +; GISEL-NEXT: extr x9, x10, x9, #63 +; GISEL-NEXT: extr x10, x11, x10, #63 +; GISEL-NEXT: stp x8, x9, [x0] +; GISEL-NEXT: extr x8, x12, x11, #63 +; GISEL-NEXT: asr x9, x14, #63 +; GISEL-NEXT: extr x11, x13, x12, #63 +; GISEL-NEXT: stp x10, x8, [x0, #16] +; GISEL-NEXT: extr x10, x14, x13, #63 +; GISEL-NEXT: orr x8, x9, x9, lsl #1 +; GISEL-NEXT: stp x11, x10, [x0, #32] +; GISEL-NEXT: stp x8, x9, [x0, #48] ; GISEL-NEXT: ret entry: %input_val = load i512, ptr %input, align 64 diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll index 12e8bf2..03f3cf1 100644 --- a/llvm/test/CodeGen/AArch64/adc.ll +++ b/llvm/test/CodeGen/AArch64/adc.ll @@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: test_shifted: ; CHECK-GI: ; %bb.0: -; CHECK-GI-NEXT: lsr x8, x2, #19 +; CHECK-GI-NEXT: extr x8, x3, x2, #19 ; CHECK-GI-NEXT: adds x0, 
x0, x2, lsl #45 -; CHECK-GI-NEXT: orr x8, x8, x3, lsl #45 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %rhs = shl i128 %b, 45 @@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) { ; CHECK-GI-NEXT: sxth x8, w2 ; CHECK-GI-NEXT: adds x0, x0, w2, sxth #3 ; CHECK-GI-NEXT: asr x9, x8, #63 -; CHECK-GI-NEXT: lsr x8, x8, #61 -; CHECK-GI-NEXT: orr x8, x8, x9, lsl #3 +; CHECK-GI-NEXT: extr x8, x9, x8, #61 ; CHECK-GI-NEXT: adc x1, x1, x8 ; CHECK-GI-NEXT: ret %ext = sext i16 %b to i128 diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll index 765f6b7..7f07ef4 100644 --- a/llvm/test/CodeGen/AArch64/fsh.ll +++ b/llvm/test/CodeGen/AArch64/fsh.ll @@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: // %entry +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c) @@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) { ; ; CHECK-GI-LABEL: fshr_i128: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #63 -; CHECK-GI-NEXT: mov w9, #127 // =0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: bic x9, x9, x4 -; CHECK-GI-NEXT: lsl x11, x0, #1 -; CHECK-GI-NEXT: and x12, x4, #0x7f -; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1 -; CHECK-GI-NEXT: sub x14, x10, x9 -; 
CHECK-GI-NEXT: sub x17, x9, #64 -; CHECK-GI-NEXT: lsl x15, x11, x9 -; CHECK-GI-NEXT: lsr x14, x11, x14 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x16, x8, x9 -; CHECK-GI-NEXT: sub x9, x10, x12 -; CHECK-GI-NEXT: lsl x10, x11, x17 -; CHECK-GI-NEXT: mvn x13, x4 -; CHECK-GI-NEXT: csel x11, x15, xzr, lo -; CHECK-GI-NEXT: sub x15, x12, #64 -; CHECK-GI-NEXT: orr x14, x14, x16 -; CHECK-GI-NEXT: lsr x16, x2, x12 -; CHECK-GI-NEXT: lsl x9, x3, x9 -; CHECK-GI-NEXT: csel x10, x14, x10, lo -; CHECK-GI-NEXT: tst x13, #0x7f -; CHECK-GI-NEXT: lsr x13, x3, x15 -; CHECK-GI-NEXT: csel x8, x8, x10, eq -; CHECK-GI-NEXT: orr x9, x16, x9 -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: lsr x10, x3, x12 -; CHECK-GI-NEXT: csel x9, x9, x13, lo +; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: lsl x9, x0, #1 +; CHECK-GI-NEXT: extr x10, x1, x0, #63 +; CHECK-GI-NEXT: bic x8, x8, x4 +; CHECK-GI-NEXT: mov w11, #64 // =0x40 +; CHECK-GI-NEXT: and x14, x4, #0x7f +; CHECK-GI-NEXT: sub x12, x11, x8 +; CHECK-GI-NEXT: lsl x13, x10, x8 +; CHECK-GI-NEXT: lsl x16, x9, x8 +; CHECK-GI-NEXT: lsr x12, x9, x12 +; CHECK-GI-NEXT: sub x17, x8, #64 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x8, x9, x17 +; CHECK-GI-NEXT: sub x11, x11, x14 +; CHECK-GI-NEXT: mvn x15, x4 +; CHECK-GI-NEXT: orr x12, x12, x13 +; CHECK-GI-NEXT: csel x9, x16, xzr, lo +; CHECK-GI-NEXT: sub x13, x14, #64 +; CHECK-GI-NEXT: lsr x16, x2, x14 +; CHECK-GI-NEXT: lsl x11, x3, x11 +; CHECK-GI-NEXT: csel x8, x12, x8, lo +; CHECK-GI-NEXT: tst x15, #0x7f +; CHECK-GI-NEXT: lsr x12, x3, x13 +; CHECK-GI-NEXT: csel x8, x10, x8, eq +; CHECK-GI-NEXT: orr x10, x16, x11 +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: lsr x11, x3, x14 +; CHECK-GI-NEXT: csel x10, x10, x12, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: csel x9, x2, x9, eq -; CHECK-GI-NEXT: cmp x12, #64 -; CHECK-GI-NEXT: csel x10, x10, xzr, lo -; CHECK-GI-NEXT: orr x0, x11, x9 -; CHECK-GI-NEXT: orr x1, x8, x10 +; CHECK-GI-NEXT: csel x10, x2, x10, eq +; CHECK-GI-NEXT: cmp x14, #64 +; CHECK-GI-NEXT: csel x11, x11, xzr, lo +; CHECK-GI-NEXT: orr x0, x9, x10 +; CHECK-GI-NEXT: orr x1, x8, x11 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c) @@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) { ; ; CHECK-GI-LABEL: rotl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x1, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x1, #61 +; CHECK-GI-NEXT: mov x1, x8 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3) @@ -731,20 +728,12 @@ entry: } define i128 @rotr_i128_c(i128 %a) { -; CHECK-SD-LABEL: rotr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x1, x0, #3 -; CHECK-SD-NEXT: extr x1, x0, x1, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: rotr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x0, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: rotr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x1, x0, #3 +; CHECK-NEXT: extr x1, x0, x1, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3) ret i128 %d @@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) { ; ; CHECK-GI-LABEL: fshl_i128_c: ; CHECK-GI: // %bb.0: // %entry -; 
CHECK-GI-NEXT: lsr x8, x0, #61 -; CHECK-GI-NEXT: lsr x9, x3, #61 -; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3 -; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x0, x0, x3, #61 ; CHECK-GI-NEXT: ret entry: %d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3) @@ -879,21 +866,12 @@ entry: } define i128 @fshr_i128_c(i128 %a, i128 %b) { -; CHECK-SD-LABEL: fshr_i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x3, x2, #3 -; CHECK-SD-NEXT: extr x1, x0, x3, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x3, #61 -; CHECK-GI-NEXT: lsr x9, x3, #3 -; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x3, x2, #3 +; CHECK-NEXT: extr x1, x0, x3, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: ret entry: %d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3) ret i128 %d @@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-NEXT: .cfi_def_cfa_offset 16 ; CHECK-GI-NEXT: .cfi_offset w19, -16 ; CHECK-GI-NEXT: ldr x11, [sp, #16] -; CHECK-GI-NEXT: mov w10, #64 // =0x40 +; CHECK-GI-NEXT: mov w9, #64 // =0x40 ; CHECK-GI-NEXT: ldr x12, [sp, #32] ; CHECK-GI-NEXT: mov w13, #127 // =0x7f -; CHECK-GI-NEXT: and x9, x11, #0x7f +; CHECK-GI-NEXT: and x8, x11, #0x7f ; CHECK-GI-NEXT: and x14, x12, #0x7f -; CHECK-GI-NEXT: mvn x15, x11 -; CHECK-GI-NEXT: sub x8, x10, x9 -; CHECK-GI-NEXT: sub x16, x9, #64 -; CHECK-GI-NEXT: lsl x19, x1, x9 -; CHECK-GI-NEXT: lsr x18, x0, x8 -; CHECK-GI-NEXT: lsl x17, x0, x9 -; CHECK-GI-NEXT: lsl x16, x0, x16 -; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: bic x0, x13, x11 -; CHECK-GI-NEXT: mvn x8, x12 -; CHECK-GI-NEXT: orr x18, x18, x19 -; CHECK-GI-NEXT: csel x9, x17, xzr, lo +; CHECK-GI-NEXT: mvn x18, x11 +; CHECK-GI-NEXT: sub x10, x9, x8 +; CHECK-GI-NEXT: sub x15, x8, #64 +; CHECK-GI-NEXT: lsl x17, x1, x8 +; CHECK-GI-NEXT: lsr x16, x0, x10 +; CHECK-GI-NEXT: lsl x15, x0, x15 +; CHECK-GI-NEXT: cmp x8, #64 +; CHECK-GI-NEXT: lsl x19, x0, x8 +; CHECK-GI-NEXT: lsl x0, x3, x14 +; CHECK-GI-NEXT: mvn x10, x12 +; CHECK-GI-NEXT: orr x16, x16, x17 ; CHECK-GI-NEXT: sub x17, x14, #64 -; CHECK-GI-NEXT: csel x16, x18, x16, lo +; CHECK-GI-NEXT: csel x15, x16, x15, lo +; CHECK-GI-NEXT: sub x16, x9, x14 +; CHECK-GI-NEXT: csel x8, x19, xzr, lo +; CHECK-GI-NEXT: lsr x16, x2, x16 ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x10, x14 -; CHECK-GI-NEXT: lsr x11, x2, x11 -; CHECK-GI-NEXT: lsl x18, x3, x14 -; CHECK-GI-NEXT: csel x16, x1, x16, eq -; CHECK-GI-NEXT: lsl x1, x2, x14 +; CHECK-GI-NEXT: lsl x19, x2, x14 ; CHECK-GI-NEXT: lsl x17, x2, x17 +; CHECK-GI-NEXT: csel x15, x1, x15, eq ; CHECK-GI-NEXT: cmp x14, #64 -; CHECK-GI-NEXT: lsl x14, x5, #63 -; CHECK-GI-NEXT: orr x11, x11, x18 -; CHECK-GI-NEXT: bic x13, x13, x12 -; CHECK-GI-NEXT: csel x18, x1, xzr, lo -; CHECK-GI-NEXT: csel x11, x11, x17, lo +; CHECK-GI-NEXT: orr x16, x16, x0 +; CHECK-GI-NEXT: bic x11, x13, x11 +; CHECK-GI-NEXT: csel x14, x19, xzr, lo +; CHECK-GI-NEXT: csel x16, x16, x17, lo ; CHECK-GI-NEXT: tst x12, #0x7f -; CHECK-GI-NEXT: lsr x12, x5, #1 -; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1 -; CHECK-GI-NEXT: lsl x17, x7, #63 -; CHECK-GI-NEXT: sub x1, x10, x0 -; CHECK-GI-NEXT: csel x11, x3, x11, eq -; CHECK-GI-NEXT: sub x2, x0, #64 -; CHECK-GI-NEXT: lsr x3, x14, x0 -; 
CHECK-GI-NEXT: lsl x1, x12, x1 -; CHECK-GI-NEXT: lsr x4, x7, #1 -; CHECK-GI-NEXT: orr x17, x17, x6, lsr #1 -; CHECK-GI-NEXT: lsr x2, x12, x2 -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: orr x1, x3, x1 -; CHECK-GI-NEXT: sub x10, x10, x13 -; CHECK-GI-NEXT: lsr x12, x12, x0 -; CHECK-GI-NEXT: csel x1, x1, x2, lo -; CHECK-GI-NEXT: tst x15, #0x7f -; CHECK-GI-NEXT: sub x15, x13, #64 -; CHECK-GI-NEXT: lsr x2, x17, x13 -; CHECK-GI-NEXT: lsl x10, x4, x10 -; CHECK-GI-NEXT: csel x14, x14, x1, eq -; CHECK-GI-NEXT: cmp x0, #64 -; CHECK-GI-NEXT: lsr x15, x4, x15 -; CHECK-GI-NEXT: lsr x0, x4, x13 -; CHECK-GI-NEXT: csel x12, x12, xzr, lo -; CHECK-GI-NEXT: orr x10, x2, x10 -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x10, x15, lo -; CHECK-GI-NEXT: tst x8, #0x7f -; CHECK-GI-NEXT: orr x1, x16, x12 -; CHECK-GI-NEXT: csel x8, x17, x10, eq -; CHECK-GI-NEXT: cmp x13, #64 -; CHECK-GI-NEXT: csel x10, x0, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x14 -; CHECK-GI-NEXT: orr x2, x18, x8 -; CHECK-GI-NEXT: orr x3, x11, x10 +; CHECK-GI-NEXT: lsr x17, x5, #1 +; CHECK-GI-NEXT: extr x0, x5, x4, #1 +; CHECK-GI-NEXT: bic x12, x13, x12 +; CHECK-GI-NEXT: csel x13, x3, x16, eq +; CHECK-GI-NEXT: sub x16, x9, x11 +; CHECK-GI-NEXT: sub x1, x11, #64 +; CHECK-GI-NEXT: lsr x3, x7, #1 +; CHECK-GI-NEXT: lsr x2, x0, x11 +; CHECK-GI-NEXT: lsl x16, x17, x16 +; CHECK-GI-NEXT: extr x4, x7, x6, #1 +; CHECK-GI-NEXT: lsr x1, x17, x1 +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: sub x9, x9, x12 +; CHECK-GI-NEXT: orr x16, x2, x16 +; CHECK-GI-NEXT: lsr x17, x17, x11 +; CHECK-GI-NEXT: lsl x9, x3, x9 +; CHECK-GI-NEXT: csel x16, x16, x1, lo +; CHECK-GI-NEXT: tst x18, #0x7f +; CHECK-GI-NEXT: sub x18, x12, #64 +; CHECK-GI-NEXT: lsr x1, x4, x12 +; CHECK-GI-NEXT: csel x16, x0, x16, eq +; CHECK-GI-NEXT: cmp x11, #64 +; CHECK-GI-NEXT: lsr x11, x3, x18 +; CHECK-GI-NEXT: csel x17, x17, xzr, lo +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: orr x9, x1, x9 +; CHECK-GI-NEXT: lsr x18, x3, x12 +; CHECK-GI-NEXT: orr x0, x8, x16 +; CHECK-GI-NEXT: csel x9, x9, x11, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: orr x1, x15, x17 +; CHECK-GI-NEXT: csel x9, x4, x9, eq +; CHECK-GI-NEXT: cmp x12, #64 +; CHECK-GI-NEXT: csel x10, x18, xzr, lo +; CHECK-GI-NEXT: orr x2, x14, x9 +; CHECK-GI-NEXT: orr x3, x13, x10 ; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload ; CHECK-GI-NEXT: ret entry: @@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) { ; CHECK-GI-LABEL: fshr_v2i128: ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: ldr x9, [sp] -; CHECK-GI-NEXT: lsl x12, x1, #1 -; CHECK-GI-NEXT: mov w11, #127 // =0x7f -; CHECK-GI-NEXT: mov w14, #64 // =0x40 -; CHECK-GI-NEXT: lsl x15, x0, #1 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: mov w12, #64 // =0x40 +; CHECK-GI-NEXT: lsl x13, x0, #1 +; CHECK-GI-NEXT: extr x14, x1, x0, #63 ; CHECK-GI-NEXT: ldr x8, [sp, #16] -; CHECK-GI-NEXT: bic x13, x11, x9 -; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63 -; CHECK-GI-NEXT: lsl x1, x3, #1 -; CHECK-GI-NEXT: sub x17, x14, x13 -; CHECK-GI-NEXT: sub x18, x13, #64 -; CHECK-GI-NEXT: lsl x3, x15, x13 -; CHECK-GI-NEXT: lsr x17, x15, x17 -; CHECK-GI-NEXT: lsl x0, x12, x13 -; CHECK-GI-NEXT: lsl x15, x15, x18 -; CHECK-GI-NEXT: bic x11, x11, x8 +; CHECK-GI-NEXT: bic x11, x10, x9 +; CHECK-GI-NEXT: mvn x16, x9 +; CHECK-GI-NEXT: and x15, x9, #0x7f +; CHECK-GI-NEXT: sub x17, x12, x11 +; CHECK-GI-NEXT: sub x18, x11, #64 +; CHECK-GI-NEXT: lsl x0, x14, x11 +; CHECK-GI-NEXT: lsr x17, x13, x17 +; CHECK-GI-NEXT: lsl x1, x13, x11 
+; CHECK-GI-NEXT: lsl x13, x13, x18 +; CHECK-GI-NEXT: bic x10, x10, x8 ; CHECK-GI-NEXT: lsl x18, x2, #1 -; CHECK-GI-NEXT: cmp x13, #64 +; CHECK-GI-NEXT: cmp x11, #64 ; CHECK-GI-NEXT: orr x17, x17, x0 -; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63 -; CHECK-GI-NEXT: mvn x16, x9 -; CHECK-GI-NEXT: csel x15, x17, x15, lo -; CHECK-GI-NEXT: sub x17, x14, x11 -; CHECK-GI-NEXT: csel x0, x3, xzr, lo +; CHECK-GI-NEXT: extr x11, x3, x2, #63 +; CHECK-GI-NEXT: csel x0, x1, xzr, lo +; CHECK-GI-NEXT: csel x13, x17, x13, lo +; CHECK-GI-NEXT: sub x17, x12, x10 ; CHECK-GI-NEXT: tst x16, #0x7f -; CHECK-GI-NEXT: sub x16, x11, #64 +; CHECK-GI-NEXT: sub x16, x10, #64 ; CHECK-GI-NEXT: lsr x17, x18, x17 -; CHECK-GI-NEXT: lsl x2, x13, x11 -; CHECK-GI-NEXT: lsl x1, x18, x11 -; CHECK-GI-NEXT: csel x12, x12, x15, eq -; CHECK-GI-NEXT: lsl x15, x18, x16 -; CHECK-GI-NEXT: and x10, x9, #0x7f -; CHECK-GI-NEXT: cmp x11, #64 -; CHECK-GI-NEXT: mvn x11, x8 +; CHECK-GI-NEXT: lsl x2, x11, x10 +; CHECK-GI-NEXT: lsl x1, x18, x10 +; CHECK-GI-NEXT: csel x13, x14, x13, eq +; CHECK-GI-NEXT: lsl x14, x18, x16 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: mvn x10, x8 ; CHECK-GI-NEXT: orr x16, x17, x2 ; CHECK-GI-NEXT: csel x17, x1, xzr, lo -; CHECK-GI-NEXT: csel x15, x16, x15, lo -; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: sub x11, x14, x10 -; CHECK-GI-NEXT: sub x16, x10, #64 -; CHECK-GI-NEXT: lsr x18, x4, x10 -; CHECK-GI-NEXT: lsl x11, x5, x11 -; CHECK-GI-NEXT: csel x13, x13, x15, eq -; CHECK-GI-NEXT: lsr x15, x5, x16 +; CHECK-GI-NEXT: csel x14, x16, x14, lo +; CHECK-GI-NEXT: tst x10, #0x7f +; CHECK-GI-NEXT: sub x10, x12, x15 +; CHECK-GI-NEXT: sub x16, x15, #64 +; CHECK-GI-NEXT: lsr x18, x4, x15 +; CHECK-GI-NEXT: lsl x10, x5, x10 +; CHECK-GI-NEXT: csel x11, x11, x14, eq +; CHECK-GI-NEXT: lsr x14, x5, x16 ; CHECK-GI-NEXT: and x1, x8, #0x7f -; CHECK-GI-NEXT: orr x11, x18, x11 -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x16, x5, x10 -; CHECK-GI-NEXT: csel x11, x11, x15, lo +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x16, x5, x15 +; CHECK-GI-NEXT: orr x10, x18, x10 +; CHECK-GI-NEXT: csel x10, x10, x14, lo ; CHECK-GI-NEXT: tst x9, #0x7f -; CHECK-GI-NEXT: sub x9, x14, x1 -; CHECK-GI-NEXT: sub x14, x1, #64 -; CHECK-GI-NEXT: lsr x15, x6, x1 +; CHECK-GI-NEXT: sub x9, x12, x1 +; CHECK-GI-NEXT: sub x12, x1, #64 +; CHECK-GI-NEXT: lsr x14, x6, x1 ; CHECK-GI-NEXT: lsl x9, x7, x9 -; CHECK-GI-NEXT: csel x11, x4, x11, eq -; CHECK-GI-NEXT: cmp x10, #64 -; CHECK-GI-NEXT: lsr x10, x7, x14 -; CHECK-GI-NEXT: csel x14, x16, xzr, lo -; CHECK-GI-NEXT: orr x9, x15, x9 +; CHECK-GI-NEXT: csel x10, x4, x10, eq +; CHECK-GI-NEXT: cmp x15, #64 +; CHECK-GI-NEXT: lsr x12, x7, x12 +; CHECK-GI-NEXT: csel x15, x16, xzr, lo +; CHECK-GI-NEXT: orr x9, x14, x9 ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: lsr x15, x7, x1 -; CHECK-GI-NEXT: csel x9, x9, x10, lo +; CHECK-GI-NEXT: lsr x14, x7, x1 +; CHECK-GI-NEXT: csel x9, x9, x12, lo ; CHECK-GI-NEXT: tst x8, #0x7f ; CHECK-GI-NEXT: csel x8, x6, x9, eq ; CHECK-GI-NEXT: cmp x1, #64 -; CHECK-GI-NEXT: orr x0, x0, x11 -; CHECK-GI-NEXT: csel x9, x15, xzr, lo -; CHECK-GI-NEXT: orr x1, x12, x14 +; CHECK-GI-NEXT: orr x0, x0, x10 +; CHECK-GI-NEXT: csel x9, x14, xzr, lo +; CHECK-GI-NEXT: orr x1, x13, x15 ; CHECK-GI-NEXT: orr x2, x17, x8 -; CHECK-GI-NEXT: orr x3, x13, x9 +; CHECK-GI-NEXT: orr x3, x11, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) @@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotl_v2i128_c: ; 
CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x3, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x1, #61 +; CHECK-GI-NEXT: extr x9, x3, x2, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x2, x2, x3, #61 ; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x3, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) { ; ; CHECK-GI-LABEL: rotr_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x1, #61 -; CHECK-GI-NEXT: lsl x9, x3, #61 -; CHECK-GI-NEXT: lsl x10, x0, #61 -; CHECK-GI-NEXT: lsl x11, x2, #61 -; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3 -; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3 -; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3 +; CHECK-GI-NEXT: extr x8, x1, x0, #3 +; CHECK-GI-NEXT: extr x9, x3, x2, #3 +; CHECK-GI-NEXT: extr x1, x0, x1, #3 +; CHECK-GI-NEXT: extr x3, x2, x3, #3 +; CHECK-GI-NEXT: mov x0, x8 +; CHECK-GI-NEXT: mov x2, x9 ; CHECK-GI-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>) @@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) { ; ; CHECK-GI-LABEL: fshl_v2i128_c: ; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsr x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x1, #3 -; CHECK-GI-NEXT: lsl x10, x3, #3 -; CHECK-GI-NEXT: lsr x11, x7, #61 -; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3 -; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61 -; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61 -; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3 +; CHECK-GI-NEXT: extr x8, x0, x5, #61 +; CHECK-GI-NEXT: extr x1, x1, x0, #61 +; CHECK-GI-NEXT: extr x3, x3, x2, #61 +; CHECK-GI-NEXT: extr x2, x2, x7, #61 ; CHECK-GI-NEXT: mov x0, x8 ; CHECK-GI-NEXT: ret entry: @@ -4480,29 +4445,15 @@ entry: } define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) { -; CHECK-SD-LABEL: fshr_v2i128_c: -; CHECK-SD: // %bb.0: // %entry -; CHECK-SD-NEXT: extr x8, x5, x4, #3 -; CHECK-SD-NEXT: extr x9, x7, x6, #3 -; CHECK-SD-NEXT: extr x1, x0, x5, #3 -; CHECK-SD-NEXT: extr x3, x2, x7, #3 -; CHECK-SD-NEXT: mov x0, x8 -; CHECK-SD-NEXT: mov x2, x9 -; CHECK-SD-NEXT: ret -; -; CHECK-GI-LABEL: fshr_v2i128_c: -; CHECK-GI: // %bb.0: // %entry -; CHECK-GI-NEXT: lsl x8, x5, #61 -; CHECK-GI-NEXT: lsl x9, x7, #61 -; CHECK-GI-NEXT: lsr x10, x5, #3 -; CHECK-GI-NEXT: lsr x11, x7, #3 -; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3 -; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3 -; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61 -; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61 -; CHECK-GI-NEXT: mov x0, x8 -; CHECK-GI-NEXT: mov x2, x9 -; CHECK-GI-NEXT: ret +; CHECK-LABEL: fshr_v2i128_c: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: extr x8, x5, x4, #3 +; CHECK-NEXT: extr x9, x7, x6, #3 +; CHECK-NEXT: extr x1, x0, x5, #3 +; CHECK-NEXT: extr x3, x2, x7, #3 +; CHECK-NEXT: mov x0, x8 +; CHECK-NEXT: mov x2, x9 +; CHECK-NEXT: ret entry: %d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>) ret <2 x i128> %d diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll index f9fd2ad..90fb102 100644 --- a/llvm/test/CodeGen/AArch64/funnel-shift.ll +++ 
b/llvm/test/CodeGen/AArch64/funnel-shift.ll @@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind { ; ; CHECK-GI-LABEL: fshl_i128: ; CHECK-GI: // %bb.0: +; CHECK-GI-NEXT: mov w8, #64 // =0x40 ; CHECK-GI-NEXT: and x9, x4, #0x7f -; CHECK-GI-NEXT: mov w10, #64 // =0x40 -; CHECK-GI-NEXT: lsl x14, x3, #63 -; CHECK-GI-NEXT: sub x12, x10, x9 +; CHECK-GI-NEXT: mov w10, #127 // =0x7f +; CHECK-GI-NEXT: sub x12, x8, x9 ; CHECK-GI-NEXT: lsl x13, x1, x9 -; CHECK-GI-NEXT: mov w8, #127 // =0x7f +; CHECK-GI-NEXT: bic x10, x10, x4 ; CHECK-GI-NEXT: lsr x12, x0, x12 -; CHECK-GI-NEXT: bic x8, x8, x4 -; CHECK-GI-NEXT: sub x15, x9, #64 +; CHECK-GI-NEXT: sub x14, x9, #64 +; CHECK-GI-NEXT: lsl x15, x0, x9 +; CHECK-GI-NEXT: extr x16, x3, x2, #1 ; CHECK-GI-NEXT: cmp x9, #64 -; CHECK-GI-NEXT: lsl x9, x0, x9 -; CHECK-GI-NEXT: lsl x15, x0, x15 -; CHECK-GI-NEXT: orr x12, x12, x13 -; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1 -; CHECK-GI-NEXT: lsr x14, x3, #1 -; CHECK-GI-NEXT: sub x10, x10, x8 -; CHECK-GI-NEXT: sub x16, x8, #64 -; CHECK-GI-NEXT: csel x9, x9, xzr, lo -; CHECK-GI-NEXT: lsr x17, x13, x8 -; CHECK-GI-NEXT: lsl x10, x14, x10 -; CHECK-GI-NEXT: csel x12, x12, x15, lo +; CHECK-GI-NEXT: sub x8, x8, x10 +; CHECK-GI-NEXT: orr x9, x12, x13 +; CHECK-GI-NEXT: lsr x12, x3, #1 +; CHECK-GI-NEXT: lsl x13, x0, x14 +; CHECK-GI-NEXT: csel x14, x15, xzr, lo +; CHECK-GI-NEXT: sub x15, x10, #64 +; CHECK-GI-NEXT: lsr x17, x16, x10 +; CHECK-GI-NEXT: lsl x8, x12, x8 +; CHECK-GI-NEXT: csel x9, x9, x13, lo ; CHECK-GI-NEXT: tst x4, #0x7f -; CHECK-GI-NEXT: lsr x15, x14, x16 +; CHECK-GI-NEXT: lsr x13, x12, x15 ; CHECK-GI-NEXT: mvn x11, x4 -; CHECK-GI-NEXT: csel x12, x1, x12, eq -; CHECK-GI-NEXT: orr x10, x17, x10 -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: lsr x14, x14, x8 -; CHECK-GI-NEXT: csel x10, x10, x15, lo +; CHECK-GI-NEXT: csel x9, x1, x9, eq +; CHECK-GI-NEXT: orr x8, x17, x8 +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: lsr x12, x12, x10 +; CHECK-GI-NEXT: csel x8, x8, x13, lo ; CHECK-GI-NEXT: tst x11, #0x7f -; CHECK-GI-NEXT: csel x10, x13, x10, eq -; CHECK-GI-NEXT: cmp x8, #64 -; CHECK-GI-NEXT: csel x8, x14, xzr, lo -; CHECK-GI-NEXT: orr x0, x9, x10 -; CHECK-GI-NEXT: orr x1, x12, x8 +; CHECK-GI-NEXT: csel x8, x16, x8, eq +; CHECK-GI-NEXT: cmp x10, #64 +; CHECK-GI-NEXT: csel x10, x12, xzr, lo +; CHECK-GI-NEXT: orr x0, x14, x8 +; CHECK-GI-NEXT: orr x1, x9, x10 ; CHECK-GI-NEXT: ret %f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z) ret i128 %f diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll index 1cb92e4..87b1108 100644 --- a/llvm/test/CodeGen/AArch64/rem-by-const.ll +++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll @@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: subs x10, x0, x9 ; CHECK-GI-NEXT: sbc x11, x1, x8 -; CHECK-GI-NEXT: lsl x12, x11, #63 +; CHECK-GI-NEXT: extr x10, x11, x10, #1 ; CHECK-GI-NEXT: lsr x11, x11, #1 -; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1 ; CHECK-GI-NEXT: adds x9, x10, x9 +; CHECK-GI-NEXT: mov w10, #7 // =0x7 ; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x10, x8, #62 +; CHECK-GI-NEXT: extr x9, x8, x9, #2 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2 -; CHECK-GI-NEXT: mov w10, #7 // =0x7 -; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: umulh x10, x9, x10 ; CHECK-GI-NEXT: lsl x11, x9, #3 -; CHECK-GI-NEXT: sub x8, x12, x8 +; CHECK-GI-NEXT: lsl x12, x8, #3 ; CHECK-GI-NEXT: sub x9, x11, x9 +; CHECK-GI-NEXT: sub x8, x12, x8 
; CHECK-GI-NEXT: subs x0, x0, x9 ; CHECK-GI-NEXT: add x8, x8, x10 ; CHECK-GI-NEXT: sbc x1, x1, x8 @@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) { ; CHECK-GI-NEXT: add x10, x11, x12 ; CHECK-GI-NEXT: add x8, x8, x14 ; CHECK-GI-NEXT: add x8, x8, x10 -; CHECK-GI-NEXT: lsl x10, x8, #60 -; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4 ; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: extr x9, x8, x9, #4 +; CHECK-GI-NEXT: lsr x8, x8, #4 ; CHECK-GI-NEXT: umulh x11, x9, x10 ; CHECK-GI-NEXT: mul x9, x9, x10 ; CHECK-GI-NEXT: madd x8, x8, x10, x11 @@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: sbc x14, x1, x12 ; CHECK-GI-NEXT: add x8, x8, x13 ; CHECK-GI-NEXT: subs x13, x2, x10 -; CHECK-GI-NEXT: lsl x15, x14, #63 -; CHECK-GI-NEXT: sbc x16, x3, x8 +; CHECK-GI-NEXT: extr x9, x14, x9, #1 +; CHECK-GI-NEXT: sbc x15, x3, x8 ; CHECK-GI-NEXT: lsr x14, x14, #1 -; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1 -; CHECK-GI-NEXT: lsl x15, x16, #63 -; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1 +; CHECK-GI-NEXT: extr x13, x15, x13, #1 ; CHECK-GI-NEXT: adds x9, x9, x11 -; CHECK-GI-NEXT: lsr x11, x16, #1 +; CHECK-GI-NEXT: lsr x11, x15, #1 ; CHECK-GI-NEXT: adc x12, x14, x12 ; CHECK-GI-NEXT: adds x10, x13, x10 -; CHECK-GI-NEXT: lsl x13, x12, #62 -; CHECK-GI-NEXT: lsr x12, x12, #2 -; CHECK-GI-NEXT: adc x8, x11, x8 -; CHECK-GI-NEXT: lsl x11, x8, #62 -; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2 +; CHECK-GI-NEXT: extr x9, x12, x9, #2 ; CHECK-GI-NEXT: mov w13, #7 // =0x7 +; CHECK-GI-NEXT: adc x8, x11, x8 +; CHECK-GI-NEXT: lsr x11, x12, #2 +; CHECK-GI-NEXT: extr x10, x8, x10, #2 +; CHECK-GI-NEXT: umulh x12, x9, x13 ; CHECK-GI-NEXT: lsr x8, x8, #2 -; CHECK-GI-NEXT: lsl x14, x12, #3 -; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2 -; CHECK-GI-NEXT: umulh x11, x9, x13 +; CHECK-GI-NEXT: lsl x14, x11, #3 ; CHECK-GI-NEXT: lsl x15, x9, #3 -; CHECK-GI-NEXT: sub x12, x14, x12 -; CHECK-GI-NEXT: lsl x16, x8, #3 ; CHECK-GI-NEXT: umulh x13, x10, x13 +; CHECK-GI-NEXT: lsl x16, x8, #3 +; CHECK-GI-NEXT: sub x11, x14, x11 ; CHECK-GI-NEXT: lsl x14, x10, #3 ; CHECK-GI-NEXT: sub x9, x15, x9 ; CHECK-GI-NEXT: sub x8, x16, x8 ; CHECK-GI-NEXT: subs x0, x0, x9 +; CHECK-GI-NEXT: add x11, x11, x12 ; CHECK-GI-NEXT: sub x10, x14, x10 -; CHECK-GI-NEXT: add x11, x12, x11 ; CHECK-GI-NEXT: sbc x1, x1, x11 ; CHECK-GI-NEXT: subs x2, x2, x10 ; CHECK-GI-NEXT: add x8, x8, x13 @@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI: // %bb.0: // %entry ; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29 ; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2 -; CHECK-GI-NEXT: sub x18, x0, x0 +; CHECK-GI-NEXT: and x5, xzr, #0x1 ; CHECK-GI-NEXT: movk x10, #49807, lsl #16 ; CHECK-GI-NEXT: movk x8, #23592, lsl #16 +; CHECK-GI-NEXT: umulh x18, x0, xzr ; CHECK-GI-NEXT: movk x10, #10485, lsl #32 ; CHECK-GI-NEXT: movk x8, #49807, lsl #32 ; CHECK-GI-NEXT: movk x10, #36700, lsl #48 @@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) { ; CHECK-GI-NEXT: umulh x15, x1, x10 ; CHECK-GI-NEXT: cset w12, hs ; CHECK-GI-NEXT: cmn x11, x13 -; CHECK-GI-NEXT: and x11, x12, #0x1 -; CHECK-GI-NEXT: umulh x16, x0, x8 -; CHECK-GI-NEXT: cset w12, hs +; CHECK-GI-NEXT: sub x13, x0, x0 ; CHECK-GI-NEXT: and x12, x12, #0x1 -; CHECK-GI-NEXT: add x14, x14, x18 -; CHECK-GI-NEXT: add x11, x11, x12 -; CHECK-GI-NEXT: and x12, xzr, #0x1 +; CHECK-GI-NEXT: umulh x16, x0, x8 +; CHECK-GI-NEXT: cset w11, hs +; CHECK-GI-NEXT: add x13, x14, x13 +; CHECK-GI-NEXT: and x11, 
x11, #0x1 +; CHECK-GI-NEXT: and x14, xzr, #0x1 ; CHECK-GI-NEXT: umulh x9, xzr, x10 -; CHECK-GI-NEXT: adds x14, x14, x15 -; CHECK-GI-NEXT: and x15, xzr, #0x1 +; CHECK-GI-NEXT: add x11, x12, x11 +; CHECK-GI-NEXT: add x12, x5, x14 +; CHECK-GI-NEXT: adds x13, x13, x15 ; CHECK-GI-NEXT: umulh x17, x1, x8 -; CHECK-GI-NEXT: cset w4, hs -; CHECK-GI-NEXT: add x15, x12, x15 -; CHECK-GI-NEXT: adds x12, x14, x16 -; CHECK-GI-NEXT: and x4, x4, #0x1 -; CHECK-GI-NEXT: mul x18, x3, x10 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: adds x12, x12, x11 -; CHECK-GI-NEXT: add x11, x15, x4 ; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: cset w15, hs -; CHECK-GI-NEXT: mul x5, x2, x8 -; CHECK-GI-NEXT: add x11, x11, x14 -; CHECK-GI-NEXT: and x14, x15, #0x1 -; CHECK-GI-NEXT: add x17, x9, x17 -; CHECK-GI-NEXT: add x14, x11, x14 -; CHECK-GI-NEXT: mov w11, #100 // =0x64 -; CHECK-GI-NEXT: umulh x13, x0, xzr -; CHECK-GI-NEXT: umulh x16, x2, x10 -; CHECK-GI-NEXT: adds x18, x18, x5 -; CHECK-GI-NEXT: mul x15, x3, x8 -; CHECK-GI-NEXT: add x13, x17, x13 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x10, x3, x10 -; CHECK-GI-NEXT: add x13, x13, x14 -; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: cmn x18, x16 -; CHECK-GI-NEXT: sub x18, x2, x2 -; CHECK-GI-NEXT: umulh x16, x2, x8 +; CHECK-GI-NEXT: adds x13, x13, x16 +; CHECK-GI-NEXT: mul x4, x3, x10 +; CHECK-GI-NEXT: add x12, x12, x14 ; CHECK-GI-NEXT: cset w14, hs -; CHECK-GI-NEXT: and x14, x14, #0x1 -; CHECK-GI-NEXT: add x15, x15, x18 +; CHECK-GI-NEXT: adds x11, x13, x11 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: mul x15, x2, x8 +; CHECK-GI-NEXT: cset w14, hs +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: and x13, x14, #0x1 +; CHECK-GI-NEXT: add x14, x9, x17 +; CHECK-GI-NEXT: sub x17, x2, x2 +; CHECK-GI-NEXT: umulh x16, x2, x10 +; CHECK-GI-NEXT: add x12, x12, x13 +; CHECK-GI-NEXT: add x13, x14, x18 +; CHECK-GI-NEXT: add x12, x13, x12 ; CHECK-GI-NEXT: and x18, xzr, #0x1 -; CHECK-GI-NEXT: add x14, x17, x14 +; CHECK-GI-NEXT: mul x5, x3, x8 +; CHECK-GI-NEXT: extr x11, x12, x11, #4 +; CHECK-GI-NEXT: adds x13, x4, x15 +; CHECK-GI-NEXT: umulh x14, x3, x10 +; CHECK-GI-NEXT: cset w15, hs +; CHECK-GI-NEXT: mov w10, #100 // =0x64 +; CHECK-GI-NEXT: cmn x13, x16 +; CHECK-GI-NEXT: and x15, x15, #0x1 +; CHECK-GI-NEXT: umulh x13, x2, x8 +; CHECK-GI-NEXT: cset w16, hs +; CHECK-GI-NEXT: add x17, x5, x17 +; CHECK-GI-NEXT: and x16, x16, #0x1 ; CHECK-GI-NEXT: umulh x8, x3, x8 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: adds x14, x17, x14 ; CHECK-GI-NEXT: and x17, xzr, #0x1 -; CHECK-GI-NEXT: adds x10, x15, x10 -; CHECK-GI-NEXT: add x15, x17, x18 +; CHECK-GI-NEXT: add x16, x18, x17 ; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: umulh x18, x2, xzr +; CHECK-GI-NEXT: adds x13, x14, x13 +; CHECK-GI-NEXT: umulh x14, x2, xzr ; CHECK-GI-NEXT: and x17, x17, #0x1 -; CHECK-GI-NEXT: adds x10, x10, x16 -; CHECK-GI-NEXT: lsl x16, x13, #60 -; CHECK-GI-NEXT: add x15, x15, x17 -; CHECK-GI-NEXT: cset w17, hs -; CHECK-GI-NEXT: adds x10, x10, x14 -; CHECK-GI-NEXT: and x14, x17, #0x1 +; CHECK-GI-NEXT: cset w18, hs +; CHECK-GI-NEXT: adds x13, x13, x15 +; CHECK-GI-NEXT: add x15, x16, x17 +; CHECK-GI-NEXT: and x16, x18, #0x1 ; CHECK-GI-NEXT: cset w17, hs ; CHECK-GI-NEXT: add x8, x9, x8 -; CHECK-GI-NEXT: add x14, x15, x14 -; CHECK-GI-NEXT: and x15, x17, #0x1 -; CHECK-GI-NEXT: orr x12, x16, x12, lsr #4 -; CHECK-GI-NEXT: add x9, x14, x15 -; CHECK-GI-NEXT: add x8, x8, x18 -; CHECK-GI-NEXT: add x8, x8, x9 -; CHECK-GI-NEXT: lsr x9, x13, #4 -; CHECK-GI-NEXT: umulh x14, x12, x11 
-; CHECK-GI-NEXT: lsl x13, x8, #60 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: and x16, x17, #0x1 +; CHECK-GI-NEXT: lsr x9, x12, #4 +; CHECK-GI-NEXT: add x15, x15, x16 +; CHECK-GI-NEXT: umulh x17, x11, x10 +; CHECK-GI-NEXT: add x8, x8, x14 +; CHECK-GI-NEXT: add x8, x8, x15 +; CHECK-GI-NEXT: mul x11, x11, x10 +; CHECK-GI-NEXT: extr x12, x8, x13, #4 ; CHECK-GI-NEXT: lsr x8, x8, #4 -; CHECK-GI-NEXT: mul x12, x12, x11 -; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4 -; CHECK-GI-NEXT: madd x9, x9, x11, x14 -; CHECK-GI-NEXT: umulh x13, x10, x11 -; CHECK-GI-NEXT: subs x0, x0, x12 -; CHECK-GI-NEXT: mul x10, x10, x11 +; CHECK-GI-NEXT: madd x9, x9, x10, x17 +; CHECK-GI-NEXT: umulh x13, x12, x10 +; CHECK-GI-NEXT: subs x0, x0, x11 +; CHECK-GI-NEXT: mul x12, x12, x10 ; CHECK-GI-NEXT: sbc x1, x1, x9 -; CHECK-GI-NEXT: madd x8, x8, x11, x13 -; CHECK-GI-NEXT: subs x2, x2, x10 +; CHECK-GI-NEXT: madd x8, x8, x10, x13 +; CHECK-GI-NEXT: subs x2, x2, x12 ; CHECK-GI-NEXT: sbc x3, x3, x8 ; CHECK-GI-NEXT: ret entry: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll index 221e2fd..09e1fca 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll @@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return: @@ -1265,7 +1265,7 @@ define amdgpu_ps void 
@atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX6-NEXT: s_mov_b32 s5, s7 ; GFX6-NEXT: s_mov_b32 s6, s8 ; GFX6-NEXT: s_mov_b32 s7, s9 -; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX6-NEXT: s_endpgm ; ; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX8-NEXT: s_mov_b32 s5, s7 ; GFX8-NEXT: s_mov_b32 s6, s8 ; GFX8-NEXT: s_mov_b32 s7, s9 -; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX8-NEXT: s_endpgm ; ; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX900-NEXT: s_mov_b32 s5, s7 ; GFX900-NEXT: s_mov_b32 s6, s8 ; GFX900-NEXT: s_mov_b32 s7, s9 -; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX900-NEXT: s_endpgm ; ; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX90A-NEXT: s_mov_b32 s5, s7 ; GFX90A-NEXT: s_mov_b32 s6, s8 ; GFX90A-NEXT: s_mov_b32 s7, s9 -; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm ; ; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX10PLUS-NEXT: s_mov_b32 s5, s7 ; GFX10PLUS-NEXT: s_mov_b32 s6, s8 ; GFX10PLUS-NEXT: s_mov_b32 s7, s9 -; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm ; GFX10PLUS-NEXT: s_endpgm ; ; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return: @@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6 ; GFX12-NEXT: s_mov_b32 s5, s7 ; GFX12-NEXT: s_mov_b32 s6, s8 ; GFX12-NEXT: s_mov_b32 s7, s9 -; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D ; GFX12-NEXT: s_endpgm main_body: %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir index 292fa4b..4f160b6 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir @@ -25,6 +25,7 @@ body: | ; GFX6-NEXT: 
[[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si]].sub0 ; GFX6-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} @@ -35,6 +36,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi]].sub0 ; GFX8-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} @@ -45,6 +47,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_]].sub0 ; GFX10-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} @@ -55,6 +58,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_]].sub0 ; GFX11-NEXT: $vgpr0 = COPY [[COPY3]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} @@ -89,39 +93,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx10 
[[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i32_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2 - ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1 @@ -150,6 +158,7 @@ body: | ; GFX6-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si]].sub0_sub1 ; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} @@ -160,6 +169,7 @@ body: | ; GFX8-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi]].sub0_sub1 ; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} @@ -170,6 +180,7 @@ body: | ; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_]].sub0_sub1 ; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} @@ -180,6 +191,7 @@ body: | ; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_]].sub0_sub1 ; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]] ; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX12-NEXT: {{ $}} @@ -214,39 +226,43 @@ body: | ; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 
= COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX6-NEXT: S_ENDPGM 0 + ; ; GFX8-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX8-NEXT: {{ $}} ; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX8-NEXT: S_ENDPGM 0 + ; ; GFX10-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX10-NEXT: {{ $}} ; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX10-NEXT: S_ENDPGM 0 + ; ; GFX11-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX11-NEXT: {{ $}} ; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX11-NEXT: S_ENDPGM 0 + ; ; GFX12-LABEL: name: atomic_cmpswap_i64_1d_no_return ; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4 ; GFX12-NEXT: {{ $}} ; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4 - ; 
GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) + ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8) ; GFX12-NEXT: S_ENDPGM 0 %0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 %1:vgpr(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll index 6c4f504..33ce278 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll @@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1) ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1) ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true 
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0 @@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3 ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]] @@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] ; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]]) ; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 @@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace ; PASS-CHECK-NEXT: br label %[[WHILE:.*]] ; PASS-CHECK: [[WHILE]]: ; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ] +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]]) +; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]] ; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]] ; PASS-CHECK: [[IF]]: ; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4 diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll index aa11574..a3e42e5 100644 --- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll +++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll @@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; @@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) { ; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64( ; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] { ; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1 +; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]]) +; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0 ; PASS-CHECK-NEXT: store i1 
[[C]], ptr addrspace(1) [[OUT]], align 1 ; PASS-CHECK-NEXT: ret void ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll index 49607e3..83f0229 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll @@ -92,8 +92,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s) ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0 -; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -106,8 +105,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a0 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc +; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm ; GFX90A-NEXT: s_endpgm %data = call i32 asm "; def $0", "=a"() %unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) @@ -123,9 +121,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 % ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a1 ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %cmp = call i32 asm "; def $0", "=a"() %swap = call i32 asm "; def $0", "=a"() @@ -139,9 +135,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 -; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc +; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm ; GFX90A-NEXT: s_endpgm %data = call i64 asm "; def $0", "=a"() %unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) @@ -154,14 +148,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3 ; GFX90A-NEXT: ;;#ASMSTART ; GFX90A-NEXT: ; def a[0:1] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0 ; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def a[0:1] +; GFX90A-NEXT: ; def a[2:3] ; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1 -; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0 -; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc +; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm ; GFX90A-NEXT: s_endpgm %cmp = call i64 asm "; def $0", "=a"() %swap = call i64 asm "; def $0", "=a"() diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll new file mode 100644 index 0000000..6c58a1a --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll @@ -0,0 +1,581 @@ +; NOTE: Assertions have 
been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s +; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISE %s +; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s + +define amdgpu_ps void @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_i64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_i64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_i64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_i64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_swap_1d_float(<8 x i32> inreg %rsrc, float %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_swap_1d_float: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_swap_1d_float: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_swap_1d_float: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_swap_1d_float: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call float @llvm.amdgcn.image.atomic.swap.1d.f32.i32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: 
s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_sub_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_sub_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_sub_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umin_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umin_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umin_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_smax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; 
GFX10PLUS-GISE-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_smax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_smax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_smax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_umax_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_umax_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_umax_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_and_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_and_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_and_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_or_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_or_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_or_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; 
GFX10PLUS-GISE-LABEL: atomic_xor_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_xor_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_xor_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_inc_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_inc_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_inc_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_dec_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_dec_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_dec_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 
%s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_cmpswap_1d_64: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_cmpswap_1d_64: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpswap_1d_64: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D +; GFX12-NEXT: s_endpgm + %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) { +; GFX10PLUS-GISE-LABEL: atomic_add_2d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) { +; GFX10PLUS-GISE-LABEL: atomic_add_3d: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_3d: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_3d: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_3d: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) { +; GFX10PLUS-GISE-LABEL: atomic_add_cube: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_cube: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_cube: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, 
v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_cube: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_1darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darray: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darray: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darray: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darray: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2dmsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2dmsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2dmsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2dmsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 
%slice, i32 %fragid) { +; GFX10PLUS-GISE-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_2darraymsaa: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_2darraymsaa: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_2darraymsaa: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0) + ret void +} + +define amdgpu_ps void @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) { +; GFX10PLUS-GISE-LABEL: atomic_add_1d_slc: +; GFX10PLUS-GISE: ; %bb.0: +; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-GISE-NEXT: s_endpgm +; +; GFX10PLUS-LABEL: atomic_add_1d_slc: +; GFX10PLUS: ; %bb.0: +; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc +; GFX10PLUS-NEXT: s_endpgm +; +; GFX12-GISE-LABEL: atomic_add_1d_slc: +; GFX12-GISE: ; %bb.0: +; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-GISE-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_1d_slc: +; GFX12: ; %bb.0: +; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT +; GFX12-NEXT: s_endpgm + %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2) + ret void +} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll index 3d1d6c8..0ba62e4 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll @@ -41,15 +41,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -79,15 +77,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 
dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -126,15 +122,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -173,15 +167,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: @@ -192,15 +184,13 @@ main_body: define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) { ; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-SDAG: ; %bb.0: ; %main_body -; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0 +; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-SDAG-NEXT: ; return to shader part epilog ; ; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt: ; GFX12-GISEL: ; %bb.0: ; %main_body -; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN -; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0 +; GFX12-GISEL-NEXT: 
image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT ; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX12-GISEL-NEXT: ; return to shader part epilog main_body: diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll index 7a876f6..3544017 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll @@ -76,6 +76,20 @@ entry: ret i32 %ret } +define noundef i32 @wave_reduce_min(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_min : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %x) + ret i32 %ret +} + +define noundef i32 @wave_reduce_umin(i32 noundef %x) { +entry: + ; CHECK: Function wave_reduce_umin : [[WAVE_FLAG]] + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %x) + ret i32 %ret +} + define void @wave_active_countbits(i1 %expr) { entry: ; CHECK: Function wave_active_countbits : [[WAVE_FLAG]] diff --git a/llvm/test/CodeGen/DirectX/WaveActiveMin.ll b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll new file mode 100644 index 0000000..24fde48 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll @@ -0,0 +1,143 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s + +; Test that for scalar values, WaveActiveMin maps down to the DirectX op + +define noundef half @wave_active_min_half(half noundef %expr) { +entry: +; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr, i8 2, i8 0){{$}} + %ret = call half @llvm.dx.wave.reduce.min.f16(half %expr) + ret half %ret +} + +define noundef float @wave_active_min_float(float noundef %expr) { +entry: +; CHECK: call float @dx.op.waveActiveOp.f32(i32 119, float %expr, i8 2, i8 0){{$}} + %ret = call float @llvm.dx.wave.reduce.min.f32(float %expr) + ret float %ret +} + +define noundef double @wave_active_min_double(double noundef %expr) { +entry: +; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr, i8 2, i8 0){{$}} + %ret = call double @llvm.dx.wave.reduce.min.f64(double %expr) + ret double %ret +} + +define noundef i16 @wave_active_min_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 0){{$}} + %ret = call i16 @llvm.dx.wave.reduce.min.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_min_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 0){{$}} + %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_min_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 0){{$}} + %ret = call i64 @llvm.dx.wave.reduce.min.i64(i64 %expr) + ret i64 %ret +} + +define noundef i16 @wave_active_umin_i16(i16 noundef %expr) { +entry: +; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 1){{$}} + %ret = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr) + ret i16 %ret +} + +define noundef i32 @wave_active_umin_i32(i32 noundef %expr) { +entry: +; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 1){{$}} + %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %expr) + ret i32 %ret +} + +define noundef i64 @wave_active_umin_i64(i64 noundef %expr) { +entry: +; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 1){{$}} + %ret = call i64 @llvm.dx.wave.reduce.umin.i64(i64 %expr) + ret i64 %ret +} + +declare half @llvm.dx.wave.reduce.min.f16(half) +declare 
float @llvm.dx.wave.reduce.min.f32(float)
+declare double @llvm.dx.wave.reduce.min.f64(double)
+
+declare i16 @llvm.dx.wave.reduce.min.i16(i16)
+declare i32 @llvm.dx.wave.reduce.min.i32(i32)
+declare i64 @llvm.dx.wave.reduce.min.i64(i64)
+
+declare i16 @llvm.dx.wave.reduce.umin.i16(i16)
+declare i32 @llvm.dx.wave.reduce.umin.i32(i32)
+declare i64 @llvm.dx.wave.reduce.umin.i64(i64)
+
+; Test that for vector values, WaveActiveMin scalarizes and maps down to the
+; DirectX op
+
+define noundef <2 x half> @wave_active_min_v2half(<2 x half> noundef %expr) {
+entry:
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i1, i8 2, i8 0){{$}}
+  %ret = call <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half> %expr)
+  ret <2 x half> %ret
+}
+
+define noundef <3 x i32> @wave_active_min_v3i32(<3 x i32> noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 0){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 0){{$}}
+  %ret = call <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32> %expr)
+  ret <3 x i32> %ret
+}
+
+define noundef <4 x double> @wave_active_min_v4f64(<4 x double> noundef %expr) {
+entry:
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i1, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i2, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i3, i8 2, i8 0){{$}}
+  %ret = call <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double> %expr)
+  ret <4 x double> %ret
+}
+
+declare <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half>)
+declare <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32>)
+declare <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double>)
+
+define noundef <2 x i16> @wave_active_umin_v2i16(<2 x i16> noundef %expr) {
+entry:
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i1, i8 2, i8 1){{$}}
+  %ret = call <2 x i16> @llvm.dx.wave.reduce.umin.v2i16(<2 x i16> %expr)
+  ret <2 x i16> %ret
+}
+
+define noundef <3 x i32> @wave_active_umin_v3i32(<3 x i32> noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 1){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 1){{$}}
+  %ret = call <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32> %expr)
+  ret <3 x i32> %ret
+}
+
+define noundef <4 x i64> @wave_active_umin_v4i64(<4 x i64> noundef %expr) {
+entry:
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i1, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i2, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i3, i8 2, i8 1){{$}}
+  %ret = call <4 x i64> @llvm.dx.wave.reduce.umin.v4i64(<4 x i64> %expr)
+  ret <4 x i64> %ret
+}
+
+declare <2 x i16> @llvm.dx.wave.reduce.umin.v2i16(<2 x i16>)
+declare <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32>)
+declare <4 x i64> @llvm.dx.wave.reduce.umin.v4i64(<4 x i64>)
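
The %expr.i0/%expr.i1 operands in the vector checks above come from the scalarizer pass named in the RUN line: it splits each vector reduction into per-lane calls of the scalar intrinsic before dxil-op-lower runs. In the dx.op.waveActiveOp calls, i32 119 is the DXIL opcode, the first i8 (2) appears to select the Min reduction kind, and the trailing i8 distinguishes the signed/float form (0) from the unsigned form (1). The <2 x i16> unsigned case should look roughly like this after scalarization (a hand-written sketch, not autogenerated output; only the value names are taken from the checks above):

  %expr.i0 = extractelement <2 x i16> %expr, i64 0
  %expr.i1 = extractelement <2 x i16> %expr, i64 1
  ; one scalar wave reduction per lane, lowered independently by dxil-op-lower
  %ret.i0 = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr.i0)
  %ret.i1 = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr.i1)
  ; lanes are reassembled into the vector result
  %ret.upto0 = insertelement <2 x i16> poison, i16 %ret.i0, i64 0
  %ret = insertelement <2 x i16> %ret.upto0, i16 %ret.i1, i64 1

diff --git 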
a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll new file mode 100644 index 0000000..48ec98c --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll @@ -0,0 +1,160 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s + +define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 +; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.minnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.minnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5 +; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4 +; CHECK-NEXT: 
xvpickve.w $xr4, $xr1, 4 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 48 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0 +; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 16 +; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2 +; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2 +; CHECK-NEXT: vextrins.w $vr4, $vr2, 32 +; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3 +; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr4, $vr0, 48 +; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2 +; CHECK-NEXT: xvst $xr4, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %x + %v1 = load <8 x float>, ptr %y + %r = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %v0, <8 x float> %v1) + store <8 x float> %r, ptr %res + ret void +} + +define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3 +; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2 +; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.d $vr3, $vr2, 16 +; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1 +; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2 +; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0 +; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %x + %v1 = load <4 x double>, ptr %y + %r = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %v0, <4 x double> %v1) + store <4 x double> %r, ptr %res + ret void +} + +declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>) +declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>) +declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll new file mode 100644 index 0000000..27ecb75 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll @@ -0,0 +1,112 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6 +; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s +; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s + +define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w 
$vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.minnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret void +} + +define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: minnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1 +; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0 +; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 16 +; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2 +; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2 +; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2 +; CHECK-NEXT: vextrins.w $vr3, $vr2, 32 +; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3 +; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3 +; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.w $vr3, $vr0, 48 +; CHECK-NEXT: vst $vr3, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %x + %v1 = load <4 x float>, ptr %y + %r = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %v0, <4 x float> %v1) + store <4 x float> %r, ptr %res + ret void +} + +define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind { +; CHECK-LABEL: maxnum_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1 +; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1 +; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2 +; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0 +; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0 +; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0 +; CHECK-NEXT: vextrins.d $vr0, $vr2, 16 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %x + %v1 = load <2 x double>, ptr %y + %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %v0, <2 x double> %v1) + store <2 x double> %r, ptr %res + ret void +} + +declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>) +declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>) +declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>) diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll new file mode 100644 index 0000000..b43555c6 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll @@ 
-0,0 +1,642 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IAB %s +; +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC-PERM %s +; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \ +; RUN: | FileCheck -check-prefixes=RV64IAB %s + +define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a3, a1, .LBB0_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_max_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a3, a2, .LBB0_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_max_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, 
i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bge a1, a3, .LBB1_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bge a2, a3, .LBB1_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB1_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a3, a1, .LBB2_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; 
RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a3, a2, .LBB2_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind { +; RV32IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC: # %bb.0: +; RV32IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-NEXT: mv a3, a2 +; RV32IB-ZALRSC-NEXT: bgeu a1, a3, .LBB3_3 +; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: mv a3, a1 +; RV32IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-NEXT: # %bb.4: +; RV32IB-ZALRSC-NEXT: mv a0, a2 +; RV32IB-ZALRSC-NEXT: ret +; +; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IB-ZALRSC-PERM: # %bb.0: +; RV32IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0) +; RV32IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV32IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV32IB-ZALRSC-PERM-NEXT: ret +; +; RV32IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV32IAB: # %bb.0: +; RV32IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV32IAB-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: bgeu a2, a3, .LBB3_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a1 
+; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a1, a2 +; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i32_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.w.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i32 %b seq_cst + ret i32 %1 +} + +define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_max_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB4_2 +; RV32IB-COMMON-NEXT: .LBB4_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_7 +; RV32IB-COMMON-NEXT: .LBB4_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB4_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB4_5 +; RV32IB-COMMON-NEXT: .LBB4_4: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB4_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB4_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB4_1 +; RV32IB-COMMON-NEXT: .LBB4_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a3, a1, .LBB4_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB4_3: # in Loop: Header=BB4_1 Depth=1 +; RV64IB-ZALRSC-NEXT: 
sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: max a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB4_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_max_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomax.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw max ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_min_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB5_2 +; RV32IB-COMMON-NEXT: .LBB5_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_7 +; RV32IB-COMMON-NEXT: .LBB5_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB5_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: slt a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB5_5 +; RV32IB-COMMON-NEXT: .LBB5_4: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB5_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB5_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB5_1 +; RV32IB-COMMON-NEXT: .LBB5_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bge a1, a3, .LBB5_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; 
RV64IB-ZALRSC-NEXT: .LBB5_3: # in Loop: Header=BB5_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: min a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB5_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_min_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomin.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw min ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umax_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB6_2 +; RV32IB-COMMON-NEXT: .LBB6_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_7 +; RV32IB-COMMON-NEXT: .LBB6_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB6_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s0, a5 +; RV32IB-COMMON-NEXT: j .LBB6_5 +; RV32IB-COMMON-NEXT: .LBB6_4: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, s2, a4 +; RV32IB-COMMON-NEXT: .LBB6_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB6_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB6_1 +; RV32IB-COMMON-NEXT: .LBB6_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a3, a1, .LBB6_3 +; 
RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB6_3: # in Loop: Header=BB6_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB6_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umax_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amomaxu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umax ptr %a, i64 %b seq_cst + ret i64 %1 +} + +define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind { +; RV32IB-COMMON-LABEL: atomicrmw_umin_i64_seq_cst: +; RV32IB-COMMON: # %bb.0: +; RV32IB-COMMON-NEXT: addi sp, sp, -32 +; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill +; RV32IB-COMMON-NEXT: mv s0, a2 +; RV32IB-COMMON-NEXT: mv s1, a0 +; RV32IB-COMMON-NEXT: lw a4, 0(a0) +; RV32IB-COMMON-NEXT: lw a5, 4(a0) +; RV32IB-COMMON-NEXT: mv s2, a1 +; RV32IB-COMMON-NEXT: j .LBB7_2 +; RV32IB-COMMON-NEXT: .LBB7_1: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sw a4, 8(sp) +; RV32IB-COMMON-NEXT: sw a5, 12(sp) +; RV32IB-COMMON-NEXT: addi a1, sp, 8 +; RV32IB-COMMON-NEXT: li a4, 5 +; RV32IB-COMMON-NEXT: li a5, 5 +; RV32IB-COMMON-NEXT: mv a0, s1 +; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8 +; RV32IB-COMMON-NEXT: lw a4, 8(sp) +; RV32IB-COMMON-NEXT: lw a5, 12(sp) +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_7 +; RV32IB-COMMON-NEXT: .LBB7_2: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1 +; RV32IB-COMMON-NEXT: beq a5, s0, .LBB7_4 +; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a5, s0 +; RV32IB-COMMON-NEXT: j .LBB7_5 +; RV32IB-COMMON-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: sltu a0, a4, s2 +; RV32IB-COMMON-NEXT: .LBB7_5: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, a4 +; RV32IB-COMMON-NEXT: mv a3, a5 +; RV32IB-COMMON-NEXT: bnez a0, .LBB7_1 +; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start +; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1 +; RV32IB-COMMON-NEXT: mv a2, s2 +; RV32IB-COMMON-NEXT: mv a3, s0 +; RV32IB-COMMON-NEXT: j .LBB7_1 +; RV32IB-COMMON-NEXT: .LBB7_7: # %atomicrmw.end +; RV32IB-COMMON-NEXT: mv a0, a4 +; RV32IB-COMMON-NEXT: mv a1, a5 +; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload +; RV32IB-COMMON-NEXT: addi sp, sp, 32 +; RV32IB-COMMON-NEXT: ret +; +; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IB-ZALRSC: # %bb.0: +; RV64IB-ZALRSC-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-NEXT: lr.d.aqrl 
a2, (a0) +; RV64IB-ZALRSC-NEXT: mv a3, a2 +; RV64IB-ZALRSC-NEXT: bgeu a1, a3, .LBB7_3 +; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: mv a3, a1 +; RV64IB-ZALRSC-NEXT: .LBB7_3: # in Loop: Header=BB7_1 Depth=1 +; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-NEXT: # %bb.4: +; RV64IB-ZALRSC-NEXT: mv a0, a2 +; RV64IB-ZALRSC-NEXT: ret +; +; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IB-ZALRSC-PERM: # %bb.0: +; RV64IB-ZALRSC-PERM-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1 +; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0) +; RV64IB-ZALRSC-PERM-NEXT: minu a3, a2, a1 +; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0) +; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB7_1 +; RV64IB-ZALRSC-PERM-NEXT: # %bb.2: +; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2 +; RV64IB-ZALRSC-PERM-NEXT: ret +; +; RV64IAB-LABEL: atomicrmw_umin_i64_seq_cst: +; RV64IAB: # %bb.0: +; RV64IAB-NEXT: amominu.d.aqrl a0, a1, (a0) +; RV64IAB-NEXT: ret + %1 = atomicrmw umin ptr %a, i64 %b seq_cst + ret i64 %1 +} diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll index 5e5f2b7..37e11db 100644 --- a/llvm/test/CodeGen/RISCV/features-info.ll +++ b/llvm/test/CodeGen/RISCV/features-info.ll @@ -81,6 +81,7 @@ ; CHECK-NEXT: optimized-nf7-segment-load-store - vlseg7eN.v and vsseg7eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-nf8-segment-load-store - vlseg8eN.v and vsseg8eN.v are implemented as a wide memory op and shuffle. ; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load. +; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs. ; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects. ; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN. ; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix. 
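
The permissive-zalrsc feature exists because RISC-V's forward-progress guarantee for LR/SC sequences applies only to constrained loops built from a restricted set of base-ISA instructions; placing Zbb min/minu/max/maxu between lr and sc, as in the -PERM checks above, steps outside that guarantee, so the shorter loops are opt-in per implementation. A minimal input that reproduces both loop shapes, reusing the RUN configurations from atomic-rmw-minmax.ll (a sketch for illustration, not part of the diff):

; llc -mtriple=riscv64 -mattr=+b,+zalrsc < %s
;   -> constrained loop: bge/mv selection between lr.w.aqrl and sc.w.rl
; llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc < %s
;   -> a single `min` folded inside the LR/SC loop
define i32 @rmw_min_example(ptr %p, i32 %v) nounwind {
  %old = atomicrmw min ptr %p, i32 %v seq_cst
  ret i32 %old
}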
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll
new file mode 100644
index 0000000..d121c1a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll
@@ -0,0 +1,57 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Test lowering to the SPIR-V backend for various scalar and vector types
+
+; CHECK: OpCapability GroupNonUniformArithmetic
+
+; CHECK-DAG: %[[#f16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#v4_half:]] = OpTypeVector %[[#f16]] 4
+; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3
+
+; CHECK-LABEL: Begin function test_float
+; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]]
+define float @test_float(float %fexpr) {
+entry:
+; CHECK: %[[#fret:]] = OpGroupNonUniformFMin %[[#f32]] %[[#scope]] Reduce %[[#fexpr]]
+  %0 = call float @llvm.spv.wave.reduce.min.f32(float %fexpr)
+  ret float %0
+}
+
+; CHECK-LABEL: Begin function test_int_signed
+; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+define i32 @test_int_signed(i32 %iexpr) {
+entry:
+; CHECK: %[[#iret:]] = OpGroupNonUniformSMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]]
+  %0 = call i32 @llvm.spv.wave.reduce.min.i32(i32 %iexpr)
+  ret i32 %0
+}
+
+; CHECK-LABEL: Begin function test_int_unsigned
+; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+define i32 @test_int_unsigned(i32 %iexpr) {
+entry:
+; CHECK: %[[#iret:]] = OpGroupNonUniformUMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]]
+  %0 = call i32 @llvm.spv.wave.reduce.umin.i32(i32 %iexpr)
+  ret i32 %0
+}
+
+; CHECK-LABEL: Begin function test_vhalf
+; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_half]]
+define <4 x half> @test_vhalf(<4 x half> %vbexpr) {
+entry:
+; CHECK: %[[#vhalfret:]] = OpGroupNonUniformFMin %[[#v4_half]] %[[#scope]] Reduce %[[#vbexpr]]
+  %0 = call <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half> %vbexpr)
+  ret <4 x half> %0
+}
+
+declare float @llvm.spv.wave.reduce.min.f32(float)
+declare i32 @llvm.spv.wave.reduce.min.i32(i32)
+declare <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half>)
+
+declare float @llvm.spv.wave.reduce.umin.f32(float)
+declare i32 @llvm.spv.wave.reduce.umin.i32(i32)
+declare <4 x half> @llvm.spv.wave.reduce.umin.v4half(<4 x half>)
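
In these checks the opcode follows the element type and the intrinsic's signedness: floating-point inputs lower to OpGroupNonUniformFMin, llvm.spv.wave.reduce.min on integers to OpGroupNonUniformSMin, and llvm.spv.wave.reduce.umin to OpGroupNonUniformUMin; the scope operand captured as %[[#scope]] is the constant 3, the SPIR-V Subgroup scope that wave semantics map onto. A double-typed case would be expected to follow the same FMin pattern (a hand-written sketch assuming the f64 overload behaves like f32; it is not part of the test above):

define double @test_double(double %dexpr) {
entry:
; expected: OpGroupNonUniformFMin <f64 type> %[[#scope]] Reduce <param>
  %0 = call double @llvm.spv.wave.reduce.min.f64(double %dexpr)
  ret double %0
}
declare double @llvm.spv.wave.reduce.min.f64(double)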
