Diffstat (limited to 'llvm/test/CodeGen')
-rw-r--r--   llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll     888
-rw-r--r--   llvm/test/CodeGen/AArch64/adc.ll                                         6
-rw-r--r--   llvm/test/CodeGen/AArch64/fsh.ll                                       473
-rw-r--r--   llvm/test/CodeGen/AArch64/funnel-shift.ll                               55
-rw-r--r--   llvm/test/CodeGen/AArch64/rem-by-const.ll                              173
-rw-r--r--   llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll     24
-rw-r--r--   llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir    36
-rw-r--r--   llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll           16
-rw-r--r--   llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll             4
-rw-r--r--   llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll         22
-rw-r--r--   llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll             581
-rw-r--r--   llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll             30
-rw-r--r--   llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll                       14
-rw-r--r--   llvm/test/CodeGen/DirectX/WaveActiveMin.ll                             143
-rw-r--r--   llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll                         160
-rw-r--r--   llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll                          112
-rw-r--r--   llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll                           642
-rw-r--r--   llvm/test/CodeGen/RISCV/features-info.ll                                 1
-rw-r--r--   llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll                57
19 files changed, 2469 insertions, 968 deletions
diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
index 41f7ab8..480fcbd 100644
--- a/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
+++ b/llvm/test/CodeGen/AArch64/GlobalISel/split-wide-shifts-multiway.ll
@@ -4992,28 +4992,21 @@ define void @test_shl_i512_const_32(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_32:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #32
-; GISEL-NEXT: lsr x13, x9, #32
-; GISEL-NEXT: lsl x8, x8, #32
-; GISEL-NEXT: orr x9, x10, x9, lsl #32
-; GISEL-NEXT: lsr x10, x11, #32
-; GISEL-NEXT: orr x11, x13, x11, lsl #32
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #32
-; GISEL-NEXT: orr x10, x10, x12, lsl #32
-; GISEL-NEXT: lsr x12, x14, #32
-; GISEL-NEXT: lsr x9, x15, #32
-; GISEL-NEXT: orr x8, x8, x14, lsl #32
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #32
-; GISEL-NEXT: lsr x12, x13, #32
-; GISEL-NEXT: orr x9, x9, x13, lsl #32
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #32
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #32
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #32
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #32
+; GISEL-NEXT: extr x10, x15, x14, #32
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #32
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5044,30 +5037,22 @@ define void @test_lshr_i512_const_32(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_32:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x13, x9, #32
-; GISEL-NEXT: lsl x15, x10, #32
-; GISEL-NEXT: orr x11, x12, x11, lsr #32
-; GISEL-NEXT: orr x8, x13, x8, lsr #32
-; GISEL-NEXT: lsl x13, x14, #32
-; GISEL-NEXT: orr x9, x15, x9, lsr #32
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #32
-; GISEL-NEXT: lsl x8, x16, #32
-; GISEL-NEXT: lsl x11, x12, #32
-; GISEL-NEXT: lsl x13, x15, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #32
-; GISEL-NEXT: lsr x10, x16, #32
-; GISEL-NEXT: orr x11, x11, x14, lsr #32
-; GISEL-NEXT: orr x9, x13, x12, lsr #32
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #32
+; GISEL-NEXT: extr x9, x13, x12, #32
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #32
+; GISEL-NEXT: extr x8, x15, x14, #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #32
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5098,32 +5083,24 @@ define void @test_ashr_i512_const_32(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_32:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x15, x9, #32
-; GISEL-NEXT: lsl x16, x10, #32
-; GISEL-NEXT: orr x11, x12, x11, lsr #32
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x8, x15, x8, lsr #32
-; GISEL-NEXT: lsl x15, x13, #32
-; GISEL-NEXT: orr x9, x16, x9, lsr #32
-; GISEL-NEXT: asr x16, x17, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x14, #32
-; GISEL-NEXT: orr x10, x15, x10, lsr #32
-; GISEL-NEXT: lsl x15, x12, #32
-; GISEL-NEXT: orr x8, x11, x13, lsr #32
-; GISEL-NEXT: lsl x11, x17, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x15, x14, lsr #32
-; GISEL-NEXT: lsl x13, x16, #32
-; GISEL-NEXT: orr x10, x11, x12, lsr #32
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: orr x8, x13, x17, asr #32
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: ldp x14, x15, [x1, #32]
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: asr x8, x13, #63
+; GISEL-NEXT: extr x11, x14, x11, #32
+; GISEL-NEXT: extr x9, x15, x14, #32
+; GISEL-NEXT: lsl x8, x8, #32
+; GISEL-NEXT: stp x10, x11, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x15, #32
+; GISEL-NEXT: extr x11, x13, x12, #32
+; GISEL-NEXT: orr x8, x8, x13, asr #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x11, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5252,23 +5229,17 @@ define void @test_shl_i512_const_96(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #32
-; GISEL-NEXT: lsr x16, x9, #32
-; GISEL-NEXT: lsl x8, x8, #32
-; GISEL-NEXT: orr x9, x14, x9, lsl #32
-; GISEL-NEXT: lsr x14, x10, #32
-; GISEL-NEXT: orr x10, x16, x10, lsl #32
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #32
-; GISEL-NEXT: orr x11, x14, x11, lsl #32
-; GISEL-NEXT: lsr x14, x12, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #32
-; GISEL-NEXT: orr x8, x8, x12, lsl #32
-; GISEL-NEXT: orr x10, x14, x13, lsl #32
-; GISEL-NEXT: orr x9, x9, x15, lsl #32
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #32
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #32
+; GISEL-NEXT: extr x9, x13, x12, #32
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #32
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5297,27 +5268,21 @@ define void @test_lshr_i512_const_96(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_96:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x13, x9, #32
-; GISEL-NEXT: orr x10, x12, x10, lsr #32
-; GISEL-NEXT: lsl x12, x11, #32
-; GISEL-NEXT: orr x8, x13, x8, lsr #32
-; GISEL-NEXT: lsl x13, x14, #32
-; GISEL-NEXT: orr x9, x12, x9, lsr #32
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #32
-; GISEL-NEXT: orr x11, x13, x11, lsr #32
-; GISEL-NEXT: lsl x12, x16, #32
-; GISEL-NEXT: orr x8, x10, x14, lsr #32
-; GISEL-NEXT: lsr x10, x16, #32
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #32
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #32
+; GISEL-NEXT: extr x9, x13, x12, #32
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #32
+; GISEL-NEXT: lsr x8, x14, #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5347,29 +5312,23 @@ define void @test_ashr_i512_const_96(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_96:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x11, [x1, #8]
-; GISEL-NEXT: ldp x10, x13, [x1, #32]
-; GISEL-NEXT: lsl x12, x8, #32
-; GISEL-NEXT: lsl x14, x9, #32
-; GISEL-NEXT: lsl x15, x10, #32
-; GISEL-NEXT: orr x11, x12, x11, lsr #32
-; GISEL-NEXT: ldp x12, x16, [x1, #48]
-; GISEL-NEXT: orr x8, x14, x8, lsr #32
-; GISEL-NEXT: lsl x14, x13, #32
-; GISEL-NEXT: orr x9, x15, x9, lsr #32
-; GISEL-NEXT: asr x15, x16, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x12, #32
-; GISEL-NEXT: orr x10, x14, x10, lsr #32
-; GISEL-NEXT: lsl x14, x16, #32
-; GISEL-NEXT: orr x8, x11, x13, lsr #32
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x13, [x1, #40]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x14, x12, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #32
+; GISEL-NEXT: extr x9, x10, x9, #32
+; GISEL-NEXT: extr x10, x11, x10, #32
+; GISEL-NEXT: asr x15, x12, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #32
+; GISEL-NEXT: extr x9, x14, x13, #32
; GISEL-NEXT: lsl x11, x15, #32
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x14, x12, lsr #32
-; GISEL-NEXT: orr x10, x11, x16, asr #32
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x15, [x0, #48]
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x14, #32
+; GISEL-NEXT: orr x8, x11, x12, asr #32
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x15, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5404,28 +5363,21 @@ define void @test_shl_i512_const_1(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_1:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #63
-; GISEL-NEXT: lsr x13, x9, #63
-; GISEL-NEXT: lsl x8, x8, #1
-; GISEL-NEXT: orr x9, x10, x9, lsl #1
-; GISEL-NEXT: lsr x10, x11, #63
-; GISEL-NEXT: orr x11, x13, x11, lsl #1
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #63
-; GISEL-NEXT: orr x10, x10, x12, lsl #1
-; GISEL-NEXT: lsr x12, x14, #63
-; GISEL-NEXT: lsr x9, x15, #63
-; GISEL-NEXT: orr x8, x8, x14, lsl #1
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #1
-; GISEL-NEXT: lsr x12, x13, #63
-; GISEL-NEXT: orr x9, x9, x13, lsl #1
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #1
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #1
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #63
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #63
+; GISEL-NEXT: extr x10, x15, x14, #63
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #63
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5457,30 +5409,22 @@ define void @test_lshr_i512_const_1(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_1:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x13, x9, #63
-; GISEL-NEXT: lsl x15, x10, #63
-; GISEL-NEXT: orr x11, x12, x11, lsr #1
-; GISEL-NEXT: orr x8, x13, x8, lsr #1
-; GISEL-NEXT: lsl x13, x14, #63
-; GISEL-NEXT: orr x9, x15, x9, lsr #1
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #1
-; GISEL-NEXT: lsl x8, x16, #63
-; GISEL-NEXT: lsl x11, x12, #63
-; GISEL-NEXT: lsl x13, x15, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #1
-; GISEL-NEXT: lsr x10, x16, #1
-; GISEL-NEXT: orr x11, x11, x14, lsr #1
-; GISEL-NEXT: orr x9, x13, x12, lsr #1
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #1
+; GISEL-NEXT: extr x9, x13, x12, #1
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #1
+; GISEL-NEXT: extr x8, x15, x14, #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #1
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5512,32 +5456,24 @@ define void @test_ashr_i512_const_1(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_1:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x15, x9, #63
-; GISEL-NEXT: lsl x16, x10, #63
-; GISEL-NEXT: orr x11, x12, x11, lsr #1
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x8, x15, x8, lsr #1
-; GISEL-NEXT: lsl x15, x13, #63
-; GISEL-NEXT: orr x9, x16, x9, lsr #1
-; GISEL-NEXT: asr x16, x17, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x14, #63
-; GISEL-NEXT: orr x10, x15, x10, lsr #1
-; GISEL-NEXT: lsl x15, x12, #63
-; GISEL-NEXT: orr x8, x11, x13, lsr #1
-; GISEL-NEXT: lsl x11, x17, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x15, x14, lsr #1
-; GISEL-NEXT: lsl x13, x16, #63
-; GISEL-NEXT: orr x10, x11, x12, lsr #1
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: orr x8, x13, x17, asr #1
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: ldp x14, x15, [x1, #32]
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: asr x8, x13, #63
+; GISEL-NEXT: extr x11, x14, x11, #1
+; GISEL-NEXT: extr x9, x15, x14, #1
+; GISEL-NEXT: lsl x8, x8, #63
+; GISEL-NEXT: stp x10, x11, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x15, #1
+; GISEL-NEXT: extr x11, x13, x12, #1
+; GISEL-NEXT: orr x8, x8, x13, asr #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x11, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5571,28 +5507,21 @@ define void @test_shl_i512_const_15(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_15:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #49
-; GISEL-NEXT: lsr x13, x9, #49
-; GISEL-NEXT: lsl x8, x8, #15
-; GISEL-NEXT: orr x9, x10, x9, lsl #15
-; GISEL-NEXT: lsr x10, x11, #49
-; GISEL-NEXT: orr x11, x13, x11, lsl #15
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #49
-; GISEL-NEXT: orr x10, x10, x12, lsl #15
-; GISEL-NEXT: lsr x12, x14, #49
-; GISEL-NEXT: lsr x9, x15, #49
-; GISEL-NEXT: orr x8, x8, x14, lsl #15
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #15
-; GISEL-NEXT: lsr x12, x13, #49
-; GISEL-NEXT: orr x9, x9, x13, lsl #15
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #15
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #15
+; GISEL-NEXT: extr x8, x9, x8, #49
+; GISEL-NEXT: extr x9, x10, x9, #49
+; GISEL-NEXT: extr x10, x11, x10, #49
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #49
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #49
+; GISEL-NEXT: extr x10, x15, x14, #49
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #49
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5624,30 +5553,22 @@ define void @test_lshr_i512_const_15(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_15:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #49
-; GISEL-NEXT: lsl x13, x9, #49
-; GISEL-NEXT: lsl x15, x10, #49
-; GISEL-NEXT: orr x11, x12, x11, lsr #15
-; GISEL-NEXT: orr x8, x13, x8, lsr #15
-; GISEL-NEXT: lsl x13, x14, #49
-; GISEL-NEXT: orr x9, x15, x9, lsr #15
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #15
-; GISEL-NEXT: lsl x8, x16, #49
-; GISEL-NEXT: lsl x11, x12, #49
-; GISEL-NEXT: lsl x13, x15, #49
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #15
-; GISEL-NEXT: lsr x10, x16, #15
-; GISEL-NEXT: orr x11, x11, x14, lsr #15
-; GISEL-NEXT: orr x9, x13, x12, lsr #15
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #15
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #15
+; GISEL-NEXT: extr x10, x11, x10, #15
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #15
+; GISEL-NEXT: extr x9, x13, x12, #15
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #15
+; GISEL-NEXT: extr x8, x15, x14, #15
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #15
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5679,32 +5600,24 @@ define void @test_ashr_i512_const_15(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_15:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #49
-; GISEL-NEXT: lsl x15, x9, #49
-; GISEL-NEXT: lsl x16, x10, #49
-; GISEL-NEXT: orr x11, x12, x11, lsr #15
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x8, x15, x8, lsr #15
-; GISEL-NEXT: lsl x15, x13, #49
-; GISEL-NEXT: orr x9, x16, x9, lsr #15
-; GISEL-NEXT: asr x16, x17, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x14, #49
-; GISEL-NEXT: orr x10, x15, x10, lsr #15
-; GISEL-NEXT: lsl x15, x12, #49
-; GISEL-NEXT: orr x8, x11, x13, lsr #15
-; GISEL-NEXT: lsl x11, x17, #49
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x15, x14, lsr #15
-; GISEL-NEXT: lsl x13, x16, #49
-; GISEL-NEXT: orr x10, x11, x12, lsr #15
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: orr x8, x13, x17, asr #15
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #15
+; GISEL-NEXT: ldp x14, x15, [x1, #32]
+; GISEL-NEXT: extr x9, x10, x9, #15
+; GISEL-NEXT: extr x10, x11, x10, #15
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: asr x8, x13, #63
+; GISEL-NEXT: extr x11, x14, x11, #15
+; GISEL-NEXT: extr x9, x15, x14, #15
+; GISEL-NEXT: lsl x8, x8, #49
+; GISEL-NEXT: stp x10, x11, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x15, #15
+; GISEL-NEXT: extr x11, x13, x12, #15
+; GISEL-NEXT: orr x8, x8, x13, asr #15
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x11, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5738,28 +5651,21 @@ define void @test_shl_i512_const_63(ptr %result, ptr %input) {
; GISEL-LABEL: test_shl_i512_const_63:
; GISEL: ; %bb.0: ; %entry
; GISEL-NEXT: ldp x8, x9, [x1]
-; GISEL-NEXT: ldp x11, x12, [x1, #16]
-; GISEL-NEXT: ldp x14, x15, [x1, #32]
-; GISEL-NEXT: lsr x10, x8, #1
-; GISEL-NEXT: lsr x13, x9, #1
-; GISEL-NEXT: lsl x8, x8, #63
-; GISEL-NEXT: orr x9, x10, x9, lsl #63
-; GISEL-NEXT: lsr x10, x11, #1
-; GISEL-NEXT: orr x11, x13, x11, lsl #63
-; GISEL-NEXT: ldp x13, x16, [x1, #48]
-; GISEL-NEXT: stp x8, x9, [x0]
-; GISEL-NEXT: lsr x8, x12, #1
-; GISEL-NEXT: orr x10, x10, x12, lsl #63
-; GISEL-NEXT: lsr x12, x14, #1
-; GISEL-NEXT: lsr x9, x15, #1
-; GISEL-NEXT: orr x8, x8, x14, lsl #63
-; GISEL-NEXT: stp x11, x10, [x0, #16]
-; GISEL-NEXT: orr x11, x12, x15, lsl #63
-; GISEL-NEXT: lsr x12, x13, #1
-; GISEL-NEXT: orr x9, x9, x13, lsl #63
-; GISEL-NEXT: stp x8, x11, [x0, #32]
-; GISEL-NEXT: orr x8, x12, x16, lsl #63
-; GISEL-NEXT: stp x9, x8, [x0, #48]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x13, x14, [x1, #32]
+; GISEL-NEXT: lsl x12, x8, #63
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: ldp x15, x16, [x1, #48]
+; GISEL-NEXT: stp x12, x8, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #1
+; GISEL-NEXT: stp x9, x10, [x0, #16]
+; GISEL-NEXT: extr x9, x14, x13, #1
+; GISEL-NEXT: extr x10, x15, x14, #1
+; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: extr x8, x16, x15, #1
+; GISEL-NEXT: stp x10, x8, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5791,30 +5697,22 @@ define void @test_lshr_i512_const_63(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_63:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x11, [x1]
-; GISEL-NEXT: ldp x10, x14, [x1, #24]
-; GISEL-NEXT: ldr x16, [x1, #56]
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x13, x9, #1
-; GISEL-NEXT: lsl x15, x10, #1
-; GISEL-NEXT: orr x11, x12, x11, lsr #63
-; GISEL-NEXT: orr x8, x13, x8, lsr #63
-; GISEL-NEXT: lsl x13, x14, #1
-; GISEL-NEXT: orr x9, x15, x9, lsr #63
-; GISEL-NEXT: ldp x12, x15, [x1, #40]
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: orr x10, x13, x10, lsr #63
-; GISEL-NEXT: lsl x8, x16, #1
-; GISEL-NEXT: lsl x11, x12, #1
-; GISEL-NEXT: lsl x13, x15, #1
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x8, x8, x15, lsr #63
-; GISEL-NEXT: lsr x10, x16, #63
-; GISEL-NEXT: orr x11, x11, x14, lsr #63
-; GISEL-NEXT: orr x9, x13, x12, lsr #63
-; GISEL-NEXT: stp x8, x10, [x0, #48]
-; GISEL-NEXT: stp x11, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #63
+; GISEL-NEXT: extr x8, x15, x14, #63
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: lsr x9, x15, #63
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5846,30 +5744,22 @@ define void @test_ashr_i512_const_63(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_63:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #8]
-; GISEL-NEXT: ldr x10, [x1]
-; GISEL-NEXT: ldp x11, x13, [x1, #24]
-; GISEL-NEXT: ldr x17, [x1, #56]
-; GISEL-NEXT: lsl x15, x9, #1
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x16, x11, #1
-; GISEL-NEXT: orr x8, x15, x8, lsr #63
-; GISEL-NEXT: lsl x15, x13, #1
-; GISEL-NEXT: orr x10, x12, x10, lsr #63
-; GISEL-NEXT: ldp x14, x12, [x1, #40]
-; GISEL-NEXT: orr x9, x16, x9, lsr #63
-; GISEL-NEXT: orr x11, x15, x11, lsr #63
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x8, x17, #1
-; GISEL-NEXT: lsl x16, x14, #1
-; GISEL-NEXT: lsl x10, x12, #1
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: asr x9, x17, #63
-; GISEL-NEXT: orr x8, x8, x12, lsr #63
-; GISEL-NEXT: orr x13, x16, x13, lsr #63
-; GISEL-NEXT: orr x10, x10, x14, lsr #63
-; GISEL-NEXT: orr x9, x9, x9, lsl #1
-; GISEL-NEXT: stp x13, x10, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1]
+; GISEL-NEXT: ldp x10, x11, [x1, #16]
+; GISEL-NEXT: ldp x12, x13, [x1, #32]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: ldp x14, x15, [x1, #48]
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: extr x11, x14, x13, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: asr x10, x15, #63
+; GISEL-NEXT: extr x8, x15, x14, #63
+; GISEL-NEXT: stp x9, x11, [x0, #32]
+; GISEL-NEXT: orr x9, x10, x10, lsl #1
; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
@@ -5906,23 +5796,17 @@ define void @test_shl_i512_const_65(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #63
-; GISEL-NEXT: lsr x16, x9, #63
-; GISEL-NEXT: lsl x8, x8, #1
-; GISEL-NEXT: orr x9, x14, x9, lsl #1
-; GISEL-NEXT: lsr x14, x10, #63
-; GISEL-NEXT: orr x10, x16, x10, lsl #1
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #63
-; GISEL-NEXT: orr x11, x14, x11, lsl #1
-; GISEL-NEXT: lsr x14, x12, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #63
-; GISEL-NEXT: orr x8, x8, x12, lsl #1
-; GISEL-NEXT: orr x10, x14, x13, lsl #1
-; GISEL-NEXT: orr x9, x9, x15, lsl #1
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #1
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #63
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -5953,27 +5837,21 @@ define void @test_lshr_i512_const_65(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_65:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x13, x9, #63
-; GISEL-NEXT: orr x10, x12, x10, lsr #1
-; GISEL-NEXT: lsl x12, x11, #63
-; GISEL-NEXT: orr x8, x13, x8, lsr #1
-; GISEL-NEXT: lsl x13, x14, #63
-; GISEL-NEXT: orr x9, x12, x9, lsr #1
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #63
-; GISEL-NEXT: orr x11, x13, x11, lsr #1
-; GISEL-NEXT: lsl x12, x16, #63
-; GISEL-NEXT: orr x8, x10, x14, lsr #1
-; GISEL-NEXT: lsr x10, x16, #1
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #1
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #1
+; GISEL-NEXT: extr x9, x13, x12, #1
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #1
+; GISEL-NEXT: lsr x8, x14, #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6005,29 +5883,23 @@ define void @test_ashr_i512_const_65(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_65:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x11, [x1, #8]
-; GISEL-NEXT: ldp x10, x13, [x1, #32]
-; GISEL-NEXT: lsl x12, x8, #63
-; GISEL-NEXT: lsl x14, x9, #63
-; GISEL-NEXT: lsl x15, x10, #63
-; GISEL-NEXT: orr x11, x12, x11, lsr #1
-; GISEL-NEXT: ldp x12, x16, [x1, #48]
-; GISEL-NEXT: orr x8, x14, x8, lsr #1
-; GISEL-NEXT: lsl x14, x13, #63
-; GISEL-NEXT: orr x9, x15, x9, lsr #1
-; GISEL-NEXT: asr x15, x16, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x12, #63
-; GISEL-NEXT: orr x10, x14, x10, lsr #1
-; GISEL-NEXT: lsl x14, x16, #63
-; GISEL-NEXT: orr x8, x11, x13, lsr #1
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x13, [x1, #40]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x14, x12, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: asr x15, x12, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #1
+; GISEL-NEXT: extr x9, x14, x13, #1
; GISEL-NEXT: lsl x11, x15, #63
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x14, x12, lsr #1
-; GISEL-NEXT: orr x10, x11, x16, asr #1
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x15, [x0, #48]
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x14, #1
+; GISEL-NEXT: orr x8, x11, x12, asr #1
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x15, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6062,23 +5934,17 @@ define void @test_shl_i512_const_100(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #28
-; GISEL-NEXT: lsr x16, x9, #28
-; GISEL-NEXT: lsl x8, x8, #36
-; GISEL-NEXT: orr x9, x14, x9, lsl #36
-; GISEL-NEXT: lsr x14, x10, #28
-; GISEL-NEXT: orr x10, x16, x10, lsl #36
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #28
-; GISEL-NEXT: orr x11, x14, x11, lsl #36
-; GISEL-NEXT: lsr x14, x12, #28
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #28
-; GISEL-NEXT: orr x8, x8, x12, lsl #36
-; GISEL-NEXT: orr x10, x14, x13, lsl #36
-; GISEL-NEXT: orr x9, x9, x15, lsl #36
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #36
+; GISEL-NEXT: extr x8, x9, x8, #28
+; GISEL-NEXT: extr x9, x10, x9, #28
+; GISEL-NEXT: extr x10, x11, x10, #28
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #28
+; GISEL-NEXT: extr x9, x13, x12, #28
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #28
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6109,27 +5975,21 @@ define void @test_lshr_i512_const_100(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_100:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #28
-; GISEL-NEXT: lsl x13, x9, #28
-; GISEL-NEXT: orr x10, x12, x10, lsr #36
-; GISEL-NEXT: lsl x12, x11, #28
-; GISEL-NEXT: orr x8, x13, x8, lsr #36
-; GISEL-NEXT: lsl x13, x14, #28
-; GISEL-NEXT: orr x9, x12, x9, lsr #36
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #28
-; GISEL-NEXT: orr x11, x13, x11, lsr #36
-; GISEL-NEXT: lsl x12, x16, #28
-; GISEL-NEXT: orr x8, x10, x14, lsr #36
-; GISEL-NEXT: lsr x10, x16, #36
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #36
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #36
+; GISEL-NEXT: extr x9, x10, x9, #36
+; GISEL-NEXT: extr x10, x11, x10, #36
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #36
+; GISEL-NEXT: extr x9, x13, x12, #36
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #36
+; GISEL-NEXT: lsr x8, x14, #36
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6161,29 +6021,23 @@ define void @test_ashr_i512_const_100(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_100:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x11, [x1, #8]
-; GISEL-NEXT: ldp x10, x13, [x1, #32]
-; GISEL-NEXT: lsl x12, x8, #28
-; GISEL-NEXT: lsl x14, x9, #28
-; GISEL-NEXT: lsl x15, x10, #28
-; GISEL-NEXT: orr x11, x12, x11, lsr #36
-; GISEL-NEXT: ldp x12, x16, [x1, #48]
-; GISEL-NEXT: orr x8, x14, x8, lsr #36
-; GISEL-NEXT: lsl x14, x13, #28
-; GISEL-NEXT: orr x9, x15, x9, lsr #36
-; GISEL-NEXT: asr x15, x16, #63
-; GISEL-NEXT: stp x11, x8, [x0]
-; GISEL-NEXT: lsl x11, x12, #28
-; GISEL-NEXT: orr x10, x14, x10, lsr #36
-; GISEL-NEXT: lsl x14, x16, #28
-; GISEL-NEXT: orr x8, x11, x13, lsr #36
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x13, [x1, #40]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x14, x12, [x1, #48]
+; GISEL-NEXT: extr x8, x9, x8, #36
+; GISEL-NEXT: extr x9, x10, x9, #36
+; GISEL-NEXT: extr x10, x11, x10, #36
+; GISEL-NEXT: asr x15, x12, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x13, x11, #36
+; GISEL-NEXT: extr x9, x14, x13, #36
; GISEL-NEXT: lsl x11, x15, #28
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: orr x9, x14, x12, lsr #36
-; GISEL-NEXT: orr x10, x11, x16, asr #36
-; GISEL-NEXT: stp x8, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x15, [x0, #48]
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x12, x14, #36
+; GISEL-NEXT: orr x8, x11, x12, asr #36
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x15, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6219,23 +6073,17 @@ define void @test_shl_i512_const_127(ptr %result, ptr %input) {
; GISEL-NEXT: ldr x15, [x1, #48]
; GISEL-NEXT: ldp x10, x11, [x1, #16]
; GISEL-NEXT: ldp x12, x13, [x1, #32]
-; GISEL-NEXT: lsr x14, x8, #1
-; GISEL-NEXT: lsr x16, x9, #1
-; GISEL-NEXT: lsl x8, x8, #63
-; GISEL-NEXT: orr x9, x14, x9, lsl #63
-; GISEL-NEXT: lsr x14, x10, #1
-; GISEL-NEXT: orr x10, x16, x10, lsl #63
-; GISEL-NEXT: stp xzr, x8, [x0]
-; GISEL-NEXT: lsr x8, x11, #1
-; GISEL-NEXT: orr x11, x14, x11, lsl #63
-; GISEL-NEXT: lsr x14, x12, #1
-; GISEL-NEXT: stp x9, x10, [x0, #16]
-; GISEL-NEXT: lsr x9, x13, #1
-; GISEL-NEXT: orr x8, x8, x12, lsl #63
-; GISEL-NEXT: orr x10, x14, x13, lsl #63
-; GISEL-NEXT: orr x9, x9, x15, lsl #63
-; GISEL-NEXT: stp x11, x8, [x0, #32]
-; GISEL-NEXT: stp x10, x9, [x0, #48]
+; GISEL-NEXT: lsl x14, x8, #63
+; GISEL-NEXT: extr x8, x9, x8, #1
+; GISEL-NEXT: extr x9, x10, x9, #1
+; GISEL-NEXT: extr x10, x11, x10, #1
+; GISEL-NEXT: stp xzr, x14, [x0]
+; GISEL-NEXT: stp x8, x9, [x0, #16]
+; GISEL-NEXT: extr x8, x12, x11, #1
+; GISEL-NEXT: extr x9, x13, x12, #1
+; GISEL-NEXT: stp x10, x8, [x0, #32]
+; GISEL-NEXT: extr x10, x15, x13, #1
+; GISEL-NEXT: stp x9, x10, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6266,27 +6114,21 @@ define void @test_lshr_i512_const_127(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_lshr_i512_const_127:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x13, x9, #1
-; GISEL-NEXT: orr x10, x12, x10, lsr #63
-; GISEL-NEXT: lsl x12, x11, #1
-; GISEL-NEXT: orr x8, x13, x8, lsr #63
-; GISEL-NEXT: lsl x13, x14, #1
-; GISEL-NEXT: orr x9, x12, x9, lsr #63
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x15, #1
-; GISEL-NEXT: orr x11, x13, x11, lsr #63
-; GISEL-NEXT: lsl x12, x16, #1
-; GISEL-NEXT: orr x8, x10, x14, lsr #63
-; GISEL-NEXT: lsr x10, x16, #63
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x12, x15, lsr #63
-; GISEL-NEXT: stp x10, xzr, [x0, #48]
-; GISEL-NEXT: stp x8, x9, [x0, #32]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: extr x9, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #63
+; GISEL-NEXT: lsr x8, x14, #63
+; GISEL-NEXT: stp x9, x10, [x0, #32]
+; GISEL-NEXT: stp x8, xzr, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
@@ -6317,28 +6159,22 @@ define void @test_ashr_i512_const_127(ptr %result, ptr %input) {
;
; GISEL-LABEL: test_ashr_i512_const_127:
; GISEL: ; %bb.0: ; %entry
-; GISEL-NEXT: ldp x8, x9, [x1, #16]
-; GISEL-NEXT: ldr x10, [x1, #8]
-; GISEL-NEXT: ldp x11, x14, [x1, #32]
-; GISEL-NEXT: ldp x15, x16, [x1, #48]
-; GISEL-NEXT: lsl x12, x8, #1
-; GISEL-NEXT: lsl x13, x9, #1
-; GISEL-NEXT: orr x10, x12, x10, lsr #63
-; GISEL-NEXT: lsl x12, x11, #1
-; GISEL-NEXT: orr x8, x13, x8, lsr #63
-; GISEL-NEXT: lsl x13, x14, #1
-; GISEL-NEXT: orr x9, x12, x9, lsr #63
-; GISEL-NEXT: lsl x12, x15, #1
-; GISEL-NEXT: stp x10, x8, [x0]
-; GISEL-NEXT: lsl x10, x16, #1
-; GISEL-NEXT: orr x11, x13, x11, lsr #63
-; GISEL-NEXT: asr x8, x16, #63
-; GISEL-NEXT: orr x12, x12, x14, lsr #63
-; GISEL-NEXT: stp x9, x11, [x0, #16]
-; GISEL-NEXT: orr x9, x10, x15, lsr #63
-; GISEL-NEXT: orr x10, x8, x8, lsl #1
-; GISEL-NEXT: stp x12, x9, [x0, #32]
-; GISEL-NEXT: stp x10, x8, [x0, #48]
+; GISEL-NEXT: ldp x8, x9, [x1, #8]
+; GISEL-NEXT: ldr x14, [x1, #56]
+; GISEL-NEXT: ldp x10, x11, [x1, #24]
+; GISEL-NEXT: ldp x12, x13, [x1, #40]
+; GISEL-NEXT: extr x8, x9, x8, #63
+; GISEL-NEXT: extr x9, x10, x9, #63
+; GISEL-NEXT: extr x10, x11, x10, #63
+; GISEL-NEXT: stp x8, x9, [x0]
+; GISEL-NEXT: extr x8, x12, x11, #63
+; GISEL-NEXT: asr x9, x14, #63
+; GISEL-NEXT: extr x11, x13, x12, #63
+; GISEL-NEXT: stp x10, x8, [x0, #16]
+; GISEL-NEXT: extr x10, x14, x13, #63
+; GISEL-NEXT: orr x8, x9, x9, lsl #1
+; GISEL-NEXT: stp x11, x10, [x0, #32]
+; GISEL-NEXT: stp x8, x9, [x0, #48]
; GISEL-NEXT: ret
entry:
%input_val = load i512, ptr %input, align 64
diff --git a/llvm/test/CodeGen/AArch64/adc.ll b/llvm/test/CodeGen/AArch64/adc.ll
index 12e8bf2..03f3cf1 100644
--- a/llvm/test/CodeGen/AArch64/adc.ll
+++ b/llvm/test/CodeGen/AArch64/adc.ll
@@ -71,9 +71,8 @@ define i128 @test_shifted(i128 %a, i128 %b) {
;
; CHECK-GI-LABEL: test_shifted:
; CHECK-GI: ; %bb.0:
-; CHECK-GI-NEXT: lsr x8, x2, #19
+; CHECK-GI-NEXT: extr x8, x3, x2, #19
; CHECK-GI-NEXT: adds x0, x0, x2, lsl #45
-; CHECK-GI-NEXT: orr x8, x8, x3, lsl #45
; CHECK-GI-NEXT: adc x1, x1, x8
; CHECK-GI-NEXT: ret
%rhs = shl i128 %b, 45
@@ -108,8 +107,7 @@ define i128 @test_extended(i128 %a, i16 %b) {
; CHECK-GI-NEXT: sxth x8, w2
; CHECK-GI-NEXT: adds x0, x0, w2, sxth #3
; CHECK-GI-NEXT: asr x9, x8, #63
-; CHECK-GI-NEXT: lsr x8, x8, #61
-; CHECK-GI-NEXT: orr x8, x8, x9, lsl #3
+; CHECK-GI-NEXT: extr x8, x9, x8, #61
; CHECK-GI-NEXT: adc x1, x1, x8
; CHECK-GI-NEXT: ret
%ext = sext i16 %b to i128
diff --git a/llvm/test/CodeGen/AArch64/fsh.ll b/llvm/test/CodeGen/AArch64/fsh.ll
index 765f6b7..7f07ef4 100644
--- a/llvm/test/CodeGen/AArch64/fsh.ll
+++ b/llvm/test/CodeGen/AArch64/fsh.ll
@@ -510,41 +510,40 @@ define i128 @fshl_i128(i128 %a, i128 %b, i128 %c) {
;
; CHECK-GI-LABEL: fshl_i128:
; CHECK-GI: // %bb.0: // %entry
+; CHECK-GI-NEXT: mov w8, #64 // =0x40
; CHECK-GI-NEXT: and x9, x4, #0x7f
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
-; CHECK-GI-NEXT: lsl x14, x3, #63
-; CHECK-GI-NEXT: sub x12, x10, x9
+; CHECK-GI-NEXT: mov w10, #127 // =0x7f
+; CHECK-GI-NEXT: sub x12, x8, x9
; CHECK-GI-NEXT: lsl x13, x1, x9
-; CHECK-GI-NEXT: mov w8, #127 // =0x7f
+; CHECK-GI-NEXT: bic x10, x10, x4
; CHECK-GI-NEXT: lsr x12, x0, x12
-; CHECK-GI-NEXT: bic x8, x8, x4
-; CHECK-GI-NEXT: sub x15, x9, #64
+; CHECK-GI-NEXT: sub x14, x9, #64
+; CHECK-GI-NEXT: lsl x15, x0, x9
+; CHECK-GI-NEXT: extr x16, x3, x2, #1
; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: lsl x9, x0, x9
-; CHECK-GI-NEXT: lsl x15, x0, x15
-; CHECK-GI-NEXT: orr x12, x12, x13
-; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT: lsr x14, x3, #1
-; CHECK-GI-NEXT: sub x10, x10, x8
-; CHECK-GI-NEXT: sub x16, x8, #64
-; CHECK-GI-NEXT: csel x9, x9, xzr, lo
-; CHECK-GI-NEXT: lsr x17, x13, x8
-; CHECK-GI-NEXT: lsl x10, x14, x10
-; CHECK-GI-NEXT: csel x12, x12, x15, lo
+; CHECK-GI-NEXT: sub x8, x8, x10
+; CHECK-GI-NEXT: orr x9, x12, x13
+; CHECK-GI-NEXT: lsr x12, x3, #1
+; CHECK-GI-NEXT: lsl x13, x0, x14
+; CHECK-GI-NEXT: csel x14, x15, xzr, lo
+; CHECK-GI-NEXT: sub x15, x10, #64
+; CHECK-GI-NEXT: lsr x17, x16, x10
+; CHECK-GI-NEXT: lsl x8, x12, x8
+; CHECK-GI-NEXT: csel x9, x9, x13, lo
; CHECK-GI-NEXT: tst x4, #0x7f
-; CHECK-GI-NEXT: lsr x15, x14, x16
+; CHECK-GI-NEXT: lsr x13, x12, x15
; CHECK-GI-NEXT: mvn x11, x4
-; CHECK-GI-NEXT: csel x12, x1, x12, eq
-; CHECK-GI-NEXT: orr x10, x17, x10
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: lsr x14, x14, x8
-; CHECK-GI-NEXT: csel x10, x10, x15, lo
+; CHECK-GI-NEXT: csel x9, x1, x9, eq
+; CHECK-GI-NEXT: orr x8, x17, x8
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: lsr x12, x12, x10
+; CHECK-GI-NEXT: csel x8, x8, x13, lo
; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: csel x10, x13, x10, eq
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: csel x8, x14, xzr, lo
-; CHECK-GI-NEXT: orr x0, x9, x10
-; CHECK-GI-NEXT: orr x1, x12, x8
+; CHECK-GI-NEXT: csel x8, x16, x8, eq
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: csel x10, x12, xzr, lo
+; CHECK-GI-NEXT: orr x0, x14, x8
+; CHECK-GI-NEXT: orr x1, x9, x10
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 %c)
@@ -571,41 +570,40 @@ define i128 @fshr_i128(i128 %a, i128 %b, i128 %c) {
;
; CHECK-GI-LABEL: fshr_i128:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x0, #63
-; CHECK-GI-NEXT: mov w9, #127 // =0x7f
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
-; CHECK-GI-NEXT: bic x9, x9, x4
-; CHECK-GI-NEXT: lsl x11, x0, #1
-; CHECK-GI-NEXT: and x12, x4, #0x7f
-; CHECK-GI-NEXT: orr x8, x8, x1, lsl #1
-; CHECK-GI-NEXT: sub x14, x10, x9
-; CHECK-GI-NEXT: sub x17, x9, #64
-; CHECK-GI-NEXT: lsl x15, x11, x9
-; CHECK-GI-NEXT: lsr x14, x11, x14
-; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: lsl x16, x8, x9
-; CHECK-GI-NEXT: sub x9, x10, x12
-; CHECK-GI-NEXT: lsl x10, x11, x17
-; CHECK-GI-NEXT: mvn x13, x4
-; CHECK-GI-NEXT: csel x11, x15, xzr, lo
-; CHECK-GI-NEXT: sub x15, x12, #64
-; CHECK-GI-NEXT: orr x14, x14, x16
-; CHECK-GI-NEXT: lsr x16, x2, x12
-; CHECK-GI-NEXT: lsl x9, x3, x9
-; CHECK-GI-NEXT: csel x10, x14, x10, lo
-; CHECK-GI-NEXT: tst x13, #0x7f
-; CHECK-GI-NEXT: lsr x13, x3, x15
-; CHECK-GI-NEXT: csel x8, x8, x10, eq
-; CHECK-GI-NEXT: orr x9, x16, x9
-; CHECK-GI-NEXT: cmp x12, #64
-; CHECK-GI-NEXT: lsr x10, x3, x12
-; CHECK-GI-NEXT: csel x9, x9, x13, lo
+; CHECK-GI-NEXT: mov w8, #127 // =0x7f
+; CHECK-GI-NEXT: lsl x9, x0, #1
+; CHECK-GI-NEXT: extr x10, x1, x0, #63
+; CHECK-GI-NEXT: bic x8, x8, x4
+; CHECK-GI-NEXT: mov w11, #64 // =0x40
+; CHECK-GI-NEXT: and x14, x4, #0x7f
+; CHECK-GI-NEXT: sub x12, x11, x8
+; CHECK-GI-NEXT: lsl x13, x10, x8
+; CHECK-GI-NEXT: lsl x16, x9, x8
+; CHECK-GI-NEXT: lsr x12, x9, x12
+; CHECK-GI-NEXT: sub x17, x8, #64
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: lsl x8, x9, x17
+; CHECK-GI-NEXT: sub x11, x11, x14
+; CHECK-GI-NEXT: mvn x15, x4
+; CHECK-GI-NEXT: orr x12, x12, x13
+; CHECK-GI-NEXT: csel x9, x16, xzr, lo
+; CHECK-GI-NEXT: sub x13, x14, #64
+; CHECK-GI-NEXT: lsr x16, x2, x14
+; CHECK-GI-NEXT: lsl x11, x3, x11
+; CHECK-GI-NEXT: csel x8, x12, x8, lo
+; CHECK-GI-NEXT: tst x15, #0x7f
+; CHECK-GI-NEXT: lsr x12, x3, x13
+; CHECK-GI-NEXT: csel x8, x10, x8, eq
+; CHECK-GI-NEXT: orr x10, x16, x11
+; CHECK-GI-NEXT: cmp x14, #64
+; CHECK-GI-NEXT: lsr x11, x3, x14
+; CHECK-GI-NEXT: csel x10, x10, x12, lo
; CHECK-GI-NEXT: tst x4, #0x7f
-; CHECK-GI-NEXT: csel x9, x2, x9, eq
-; CHECK-GI-NEXT: cmp x12, #64
-; CHECK-GI-NEXT: csel x10, x10, xzr, lo
-; CHECK-GI-NEXT: orr x0, x11, x9
-; CHECK-GI-NEXT: orr x1, x8, x10
+; CHECK-GI-NEXT: csel x10, x2, x10, eq
+; CHECK-GI-NEXT: cmp x14, #64
+; CHECK-GI-NEXT: csel x11, x11, xzr, lo
+; CHECK-GI-NEXT: orr x0, x9, x10
+; CHECK-GI-NEXT: orr x1, x8, x11
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 %c)
@@ -720,10 +718,9 @@ define i128 @rotl_i128_c(i128 %a) {
;
; CHECK-GI-LABEL: rotl_i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x0, #61
-; CHECK-GI-NEXT: lsr x9, x1, #61
-; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT: extr x8, x1, x0, #61
+; CHECK-GI-NEXT: extr x0, x0, x1, #61
+; CHECK-GI-NEXT: mov x1, x8
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshl(i128 %a, i128 %a, i128 3)
@@ -731,20 +728,12 @@ entry:
}
define i128 @rotr_i128_c(i128 %a) {
-; CHECK-SD-LABEL: rotr_i128_c:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x8, x1, x0, #3
-; CHECK-SD-NEXT: extr x1, x0, x1, #3
-; CHECK-SD-NEXT: mov x0, x8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: rotr_i128_c:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x1, #61
-; CHECK-GI-NEXT: lsl x9, x0, #61
-; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT: orr x1, x9, x1, lsr #3
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: rotr_i128_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x8, x1, x0, #3
+; CHECK-NEXT: extr x1, x0, x1, #3
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
entry:
%d = call i128 @llvm.fshr(i128 %a, i128 %a, i128 3)
ret i128 %d
@@ -868,10 +857,8 @@ define i128 @fshl_i128_c(i128 %a, i128 %b) {
;
; CHECK-GI-LABEL: fshl_i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x0, #61
-; CHECK-GI-NEXT: lsr x9, x3, #61
-; CHECK-GI-NEXT: orr x1, x8, x1, lsl #3
-; CHECK-GI-NEXT: orr x0, x9, x0, lsl #3
+; CHECK-GI-NEXT: extr x1, x1, x0, #61
+; CHECK-GI-NEXT: extr x0, x0, x3, #61
; CHECK-GI-NEXT: ret
entry:
%d = call i128 @llvm.fshl(i128 %a, i128 %b, i128 3)
@@ -879,21 +866,12 @@ entry:
}
define i128 @fshr_i128_c(i128 %a, i128 %b) {
-; CHECK-SD-LABEL: fshr_i128_c:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x8, x3, x2, #3
-; CHECK-SD-NEXT: extr x1, x0, x3, #3
-; CHECK-SD-NEXT: mov x0, x8
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fshr_i128_c:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x3, #61
-; CHECK-GI-NEXT: lsr x9, x3, #3
-; CHECK-GI-NEXT: orr x8, x8, x2, lsr #3
-; CHECK-GI-NEXT: orr x1, x9, x0, lsl #61
-; CHECK-GI-NEXT: mov x0, x8
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fshr_i128_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x8, x3, x2, #3
+; CHECK-NEXT: extr x1, x0, x3, #3
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: ret
entry:
%d = call i128 @llvm.fshr(i128 %a, i128 %b, i128 3)
ret i128 %d
@@ -3013,75 +2991,73 @@ define <2 x i128> @fshl_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
; CHECK-GI-NEXT: .cfi_def_cfa_offset 16
; CHECK-GI-NEXT: .cfi_offset w19, -16
; CHECK-GI-NEXT: ldr x11, [sp, #16]
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
+; CHECK-GI-NEXT: mov w9, #64 // =0x40
; CHECK-GI-NEXT: ldr x12, [sp, #32]
; CHECK-GI-NEXT: mov w13, #127 // =0x7f
-; CHECK-GI-NEXT: and x9, x11, #0x7f
+; CHECK-GI-NEXT: and x8, x11, #0x7f
; CHECK-GI-NEXT: and x14, x12, #0x7f
-; CHECK-GI-NEXT: mvn x15, x11
-; CHECK-GI-NEXT: sub x8, x10, x9
-; CHECK-GI-NEXT: sub x16, x9, #64
-; CHECK-GI-NEXT: lsl x19, x1, x9
-; CHECK-GI-NEXT: lsr x18, x0, x8
-; CHECK-GI-NEXT: lsl x17, x0, x9
-; CHECK-GI-NEXT: lsl x16, x0, x16
-; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: bic x0, x13, x11
-; CHECK-GI-NEXT: mvn x8, x12
-; CHECK-GI-NEXT: orr x18, x18, x19
-; CHECK-GI-NEXT: csel x9, x17, xzr, lo
+; CHECK-GI-NEXT: mvn x18, x11
+; CHECK-GI-NEXT: sub x10, x9, x8
+; CHECK-GI-NEXT: sub x15, x8, #64
+; CHECK-GI-NEXT: lsl x17, x1, x8
+; CHECK-GI-NEXT: lsr x16, x0, x10
+; CHECK-GI-NEXT: lsl x15, x0, x15
+; CHECK-GI-NEXT: cmp x8, #64
+; CHECK-GI-NEXT: lsl x19, x0, x8
+; CHECK-GI-NEXT: lsl x0, x3, x14
+; CHECK-GI-NEXT: mvn x10, x12
+; CHECK-GI-NEXT: orr x16, x16, x17
; CHECK-GI-NEXT: sub x17, x14, #64
-; CHECK-GI-NEXT: csel x16, x18, x16, lo
+; CHECK-GI-NEXT: csel x15, x16, x15, lo
+; CHECK-GI-NEXT: sub x16, x9, x14
+; CHECK-GI-NEXT: csel x8, x19, xzr, lo
+; CHECK-GI-NEXT: lsr x16, x2, x16
; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: sub x11, x10, x14
-; CHECK-GI-NEXT: lsr x11, x2, x11
-; CHECK-GI-NEXT: lsl x18, x3, x14
-; CHECK-GI-NEXT: csel x16, x1, x16, eq
-; CHECK-GI-NEXT: lsl x1, x2, x14
+; CHECK-GI-NEXT: lsl x19, x2, x14
; CHECK-GI-NEXT: lsl x17, x2, x17
+; CHECK-GI-NEXT: csel x15, x1, x15, eq
; CHECK-GI-NEXT: cmp x14, #64
-; CHECK-GI-NEXT: lsl x14, x5, #63
-; CHECK-GI-NEXT: orr x11, x11, x18
-; CHECK-GI-NEXT: bic x13, x13, x12
-; CHECK-GI-NEXT: csel x18, x1, xzr, lo
-; CHECK-GI-NEXT: csel x11, x11, x17, lo
+; CHECK-GI-NEXT: orr x16, x16, x0
+; CHECK-GI-NEXT: bic x11, x13, x11
+; CHECK-GI-NEXT: csel x14, x19, xzr, lo
+; CHECK-GI-NEXT: csel x16, x16, x17, lo
; CHECK-GI-NEXT: tst x12, #0x7f
-; CHECK-GI-NEXT: lsr x12, x5, #1
-; CHECK-GI-NEXT: orr x14, x14, x4, lsr #1
-; CHECK-GI-NEXT: lsl x17, x7, #63
-; CHECK-GI-NEXT: sub x1, x10, x0
-; CHECK-GI-NEXT: csel x11, x3, x11, eq
-; CHECK-GI-NEXT: sub x2, x0, #64
-; CHECK-GI-NEXT: lsr x3, x14, x0
-; CHECK-GI-NEXT: lsl x1, x12, x1
-; CHECK-GI-NEXT: lsr x4, x7, #1
-; CHECK-GI-NEXT: orr x17, x17, x6, lsr #1
-; CHECK-GI-NEXT: lsr x2, x12, x2
-; CHECK-GI-NEXT: cmp x0, #64
-; CHECK-GI-NEXT: orr x1, x3, x1
-; CHECK-GI-NEXT: sub x10, x10, x13
-; CHECK-GI-NEXT: lsr x12, x12, x0
-; CHECK-GI-NEXT: csel x1, x1, x2, lo
-; CHECK-GI-NEXT: tst x15, #0x7f
-; CHECK-GI-NEXT: sub x15, x13, #64
-; CHECK-GI-NEXT: lsr x2, x17, x13
-; CHECK-GI-NEXT: lsl x10, x4, x10
-; CHECK-GI-NEXT: csel x14, x14, x1, eq
-; CHECK-GI-NEXT: cmp x0, #64
-; CHECK-GI-NEXT: lsr x15, x4, x15
-; CHECK-GI-NEXT: lsr x0, x4, x13
-; CHECK-GI-NEXT: csel x12, x12, xzr, lo
-; CHECK-GI-NEXT: orr x10, x2, x10
-; CHECK-GI-NEXT: cmp x13, #64
-; CHECK-GI-NEXT: csel x10, x10, x15, lo
-; CHECK-GI-NEXT: tst x8, #0x7f
-; CHECK-GI-NEXT: orr x1, x16, x12
-; CHECK-GI-NEXT: csel x8, x17, x10, eq
-; CHECK-GI-NEXT: cmp x13, #64
-; CHECK-GI-NEXT: csel x10, x0, xzr, lo
-; CHECK-GI-NEXT: orr x0, x9, x14
-; CHECK-GI-NEXT: orr x2, x18, x8
-; CHECK-GI-NEXT: orr x3, x11, x10
+; CHECK-GI-NEXT: lsr x17, x5, #1
+; CHECK-GI-NEXT: extr x0, x5, x4, #1
+; CHECK-GI-NEXT: bic x12, x13, x12
+; CHECK-GI-NEXT: csel x13, x3, x16, eq
+; CHECK-GI-NEXT: sub x16, x9, x11
+; CHECK-GI-NEXT: sub x1, x11, #64
+; CHECK-GI-NEXT: lsr x3, x7, #1
+; CHECK-GI-NEXT: lsr x2, x0, x11
+; CHECK-GI-NEXT: lsl x16, x17, x16
+; CHECK-GI-NEXT: extr x4, x7, x6, #1
+; CHECK-GI-NEXT: lsr x1, x17, x1
+; CHECK-GI-NEXT: cmp x11, #64
+; CHECK-GI-NEXT: sub x9, x9, x12
+; CHECK-GI-NEXT: orr x16, x2, x16
+; CHECK-GI-NEXT: lsr x17, x17, x11
+; CHECK-GI-NEXT: lsl x9, x3, x9
+; CHECK-GI-NEXT: csel x16, x16, x1, lo
+; CHECK-GI-NEXT: tst x18, #0x7f
+; CHECK-GI-NEXT: sub x18, x12, #64
+; CHECK-GI-NEXT: lsr x1, x4, x12
+; CHECK-GI-NEXT: csel x16, x0, x16, eq
+; CHECK-GI-NEXT: cmp x11, #64
+; CHECK-GI-NEXT: lsr x11, x3, x18
+; CHECK-GI-NEXT: csel x17, x17, xzr, lo
+; CHECK-GI-NEXT: cmp x12, #64
+; CHECK-GI-NEXT: orr x9, x1, x9
+; CHECK-GI-NEXT: lsr x18, x3, x12
+; CHECK-GI-NEXT: orr x0, x8, x16
+; CHECK-GI-NEXT: csel x9, x9, x11, lo
+; CHECK-GI-NEXT: tst x10, #0x7f
+; CHECK-GI-NEXT: orr x1, x15, x17
+; CHECK-GI-NEXT: csel x9, x4, x9, eq
+; CHECK-GI-NEXT: cmp x12, #64
+; CHECK-GI-NEXT: csel x10, x18, xzr, lo
+; CHECK-GI-NEXT: orr x2, x14, x9
+; CHECK-GI-NEXT: orr x3, x13, x10
; CHECK-GI-NEXT: ldr x19, [sp], #16 // 8-byte Folded Reload
; CHECK-GI-NEXT: ret
entry:
@@ -3125,75 +3101,73 @@ define <2 x i128> @fshr_v2i128(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c) {
; CHECK-GI-LABEL: fshr_v2i128:
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: ldr x9, [sp]
-; CHECK-GI-NEXT: lsl x12, x1, #1
-; CHECK-GI-NEXT: mov w11, #127 // =0x7f
-; CHECK-GI-NEXT: mov w14, #64 // =0x40
-; CHECK-GI-NEXT: lsl x15, x0, #1
+; CHECK-GI-NEXT: mov w10, #127 // =0x7f
+; CHECK-GI-NEXT: mov w12, #64 // =0x40
+; CHECK-GI-NEXT: lsl x13, x0, #1
+; CHECK-GI-NEXT: extr x14, x1, x0, #63
; CHECK-GI-NEXT: ldr x8, [sp, #16]
-; CHECK-GI-NEXT: bic x13, x11, x9
-; CHECK-GI-NEXT: orr x12, x12, x0, lsr #63
-; CHECK-GI-NEXT: lsl x1, x3, #1
-; CHECK-GI-NEXT: sub x17, x14, x13
-; CHECK-GI-NEXT: sub x18, x13, #64
-; CHECK-GI-NEXT: lsl x3, x15, x13
-; CHECK-GI-NEXT: lsr x17, x15, x17
-; CHECK-GI-NEXT: lsl x0, x12, x13
-; CHECK-GI-NEXT: lsl x15, x15, x18
-; CHECK-GI-NEXT: bic x11, x11, x8
+; CHECK-GI-NEXT: bic x11, x10, x9
+; CHECK-GI-NEXT: mvn x16, x9
+; CHECK-GI-NEXT: and x15, x9, #0x7f
+; CHECK-GI-NEXT: sub x17, x12, x11
+; CHECK-GI-NEXT: sub x18, x11, #64
+; CHECK-GI-NEXT: lsl x0, x14, x11
+; CHECK-GI-NEXT: lsr x17, x13, x17
+; CHECK-GI-NEXT: lsl x1, x13, x11
+; CHECK-GI-NEXT: lsl x13, x13, x18
+; CHECK-GI-NEXT: bic x10, x10, x8
; CHECK-GI-NEXT: lsl x18, x2, #1
-; CHECK-GI-NEXT: cmp x13, #64
+; CHECK-GI-NEXT: cmp x11, #64
; CHECK-GI-NEXT: orr x17, x17, x0
-; CHECK-GI-NEXT: orr x13, x1, x2, lsr #63
-; CHECK-GI-NEXT: mvn x16, x9
-; CHECK-GI-NEXT: csel x15, x17, x15, lo
-; CHECK-GI-NEXT: sub x17, x14, x11
-; CHECK-GI-NEXT: csel x0, x3, xzr, lo
+; CHECK-GI-NEXT: extr x11, x3, x2, #63
+; CHECK-GI-NEXT: csel x0, x1, xzr, lo
+; CHECK-GI-NEXT: csel x13, x17, x13, lo
+; CHECK-GI-NEXT: sub x17, x12, x10
; CHECK-GI-NEXT: tst x16, #0x7f
-; CHECK-GI-NEXT: sub x16, x11, #64
+; CHECK-GI-NEXT: sub x16, x10, #64
; CHECK-GI-NEXT: lsr x17, x18, x17
-; CHECK-GI-NEXT: lsl x2, x13, x11
-; CHECK-GI-NEXT: lsl x1, x18, x11
-; CHECK-GI-NEXT: csel x12, x12, x15, eq
-; CHECK-GI-NEXT: lsl x15, x18, x16
-; CHECK-GI-NEXT: and x10, x9, #0x7f
-; CHECK-GI-NEXT: cmp x11, #64
-; CHECK-GI-NEXT: mvn x11, x8
+; CHECK-GI-NEXT: lsl x2, x11, x10
+; CHECK-GI-NEXT: lsl x1, x18, x10
+; CHECK-GI-NEXT: csel x13, x14, x13, eq
+; CHECK-GI-NEXT: lsl x14, x18, x16
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: mvn x10, x8
; CHECK-GI-NEXT: orr x16, x17, x2
; CHECK-GI-NEXT: csel x17, x1, xzr, lo
-; CHECK-GI-NEXT: csel x15, x16, x15, lo
-; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: sub x11, x14, x10
-; CHECK-GI-NEXT: sub x16, x10, #64
-; CHECK-GI-NEXT: lsr x18, x4, x10
-; CHECK-GI-NEXT: lsl x11, x5, x11
-; CHECK-GI-NEXT: csel x13, x13, x15, eq
-; CHECK-GI-NEXT: lsr x15, x5, x16
+; CHECK-GI-NEXT: csel x14, x16, x14, lo
+; CHECK-GI-NEXT: tst x10, #0x7f
+; CHECK-GI-NEXT: sub x10, x12, x15
+; CHECK-GI-NEXT: sub x16, x15, #64
+; CHECK-GI-NEXT: lsr x18, x4, x15
+; CHECK-GI-NEXT: lsl x10, x5, x10
+; CHECK-GI-NEXT: csel x11, x11, x14, eq
+; CHECK-GI-NEXT: lsr x14, x5, x16
; CHECK-GI-NEXT: and x1, x8, #0x7f
-; CHECK-GI-NEXT: orr x11, x18, x11
-; CHECK-GI-NEXT: cmp x10, #64
-; CHECK-GI-NEXT: lsr x16, x5, x10
-; CHECK-GI-NEXT: csel x11, x11, x15, lo
+; CHECK-GI-NEXT: cmp x15, #64
+; CHECK-GI-NEXT: lsr x16, x5, x15
+; CHECK-GI-NEXT: orr x10, x18, x10
+; CHECK-GI-NEXT: csel x10, x10, x14, lo
; CHECK-GI-NEXT: tst x9, #0x7f
-; CHECK-GI-NEXT: sub x9, x14, x1
-; CHECK-GI-NEXT: sub x14, x1, #64
-; CHECK-GI-NEXT: lsr x15, x6, x1
+; CHECK-GI-NEXT: sub x9, x12, x1
+; CHECK-GI-NEXT: sub x12, x1, #64
+; CHECK-GI-NEXT: lsr x14, x6, x1
; CHECK-GI-NEXT: lsl x9, x7, x9
-; CHECK-GI-NEXT: csel x11, x4, x11, eq
-; CHECK-GI-NEXT: cmp x10, #64
-; CHECK-GI-NEXT: lsr x10, x7, x14
-; CHECK-GI-NEXT: csel x14, x16, xzr, lo
-; CHECK-GI-NEXT: orr x9, x15, x9
+; CHECK-GI-NEXT: csel x10, x4, x10, eq
+; CHECK-GI-NEXT: cmp x15, #64
+; CHECK-GI-NEXT: lsr x12, x7, x12
+; CHECK-GI-NEXT: csel x15, x16, xzr, lo
+; CHECK-GI-NEXT: orr x9, x14, x9
; CHECK-GI-NEXT: cmp x1, #64
-; CHECK-GI-NEXT: lsr x15, x7, x1
-; CHECK-GI-NEXT: csel x9, x9, x10, lo
+; CHECK-GI-NEXT: lsr x14, x7, x1
+; CHECK-GI-NEXT: csel x9, x9, x12, lo
; CHECK-GI-NEXT: tst x8, #0x7f
; CHECK-GI-NEXT: csel x8, x6, x9, eq
; CHECK-GI-NEXT: cmp x1, #64
-; CHECK-GI-NEXT: orr x0, x0, x11
-; CHECK-GI-NEXT: csel x9, x15, xzr, lo
-; CHECK-GI-NEXT: orr x1, x12, x14
+; CHECK-GI-NEXT: orr x0, x0, x10
+; CHECK-GI-NEXT: csel x9, x14, xzr, lo
+; CHECK-GI-NEXT: orr x1, x13, x15
; CHECK-GI-NEXT: orr x2, x17, x8
-; CHECK-GI-NEXT: orr x3, x13, x9
+; CHECK-GI-NEXT: orr x3, x11, x9
; CHECK-GI-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> %c)
@@ -3863,15 +3837,12 @@ define <2 x i128> @rotl_v2i128_c(<2 x i128> %a) {
;
; CHECK-GI-LABEL: rotl_v2i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x1, #61
-; CHECK-GI-NEXT: lsl x9, x1, #3
-; CHECK-GI-NEXT: lsl x10, x3, #3
-; CHECK-GI-NEXT: lsr x11, x3, #61
-; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT: extr x8, x0, x1, #61
+; CHECK-GI-NEXT: extr x9, x3, x2, #61
+; CHECK-GI-NEXT: extr x1, x1, x0, #61
+; CHECK-GI-NEXT: extr x2, x2, x3, #61
; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: mov x3, x9
; CHECK-GI-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshl(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -3891,14 +3862,12 @@ define <2 x i128> @rotr_v2i128_c(<2 x i128> %a) {
;
; CHECK-GI-LABEL: rotr_v2i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x1, #61
-; CHECK-GI-NEXT: lsl x9, x3, #61
-; CHECK-GI-NEXT: lsl x10, x0, #61
-; CHECK-GI-NEXT: lsl x11, x2, #61
-; CHECK-GI-NEXT: orr x0, x8, x0, lsr #3
-; CHECK-GI-NEXT: orr x2, x9, x2, lsr #3
-; CHECK-GI-NEXT: orr x1, x10, x1, lsr #3
-; CHECK-GI-NEXT: orr x3, x11, x3, lsr #3
+; CHECK-GI-NEXT: extr x8, x1, x0, #3
+; CHECK-GI-NEXT: extr x9, x3, x2, #3
+; CHECK-GI-NEXT: extr x1, x0, x1, #3
+; CHECK-GI-NEXT: extr x3, x2, x3, #3
+; CHECK-GI-NEXT: mov x0, x8
+; CHECK-GI-NEXT: mov x2, x9
; CHECK-GI-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %a, <2 x i128> <i128 3, i128 3>)
@@ -4464,14 +4433,10 @@ define <2 x i128> @fshl_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
;
; CHECK-GI-LABEL: fshl_v2i128_c:
; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsr x8, x5, #61
-; CHECK-GI-NEXT: lsl x9, x1, #3
-; CHECK-GI-NEXT: lsl x10, x3, #3
-; CHECK-GI-NEXT: lsr x11, x7, #61
-; CHECK-GI-NEXT: orr x8, x8, x0, lsl #3
-; CHECK-GI-NEXT: orr x1, x9, x0, lsr #61
-; CHECK-GI-NEXT: orr x3, x10, x2, lsr #61
-; CHECK-GI-NEXT: orr x2, x11, x2, lsl #3
+; CHECK-GI-NEXT: extr x8, x0, x5, #61
+; CHECK-GI-NEXT: extr x1, x1, x0, #61
+; CHECK-GI-NEXT: extr x3, x3, x2, #61
+; CHECK-GI-NEXT: extr x2, x2, x7, #61
; CHECK-GI-NEXT: mov x0, x8
; CHECK-GI-NEXT: ret
entry:
@@ -4480,29 +4445,15 @@ entry:
}
define <2 x i128> @fshr_v2i128_c(<2 x i128> %a, <2 x i128> %b) {
-; CHECK-SD-LABEL: fshr_v2i128_c:
-; CHECK-SD: // %bb.0: // %entry
-; CHECK-SD-NEXT: extr x8, x5, x4, #3
-; CHECK-SD-NEXT: extr x9, x7, x6, #3
-; CHECK-SD-NEXT: extr x1, x0, x5, #3
-; CHECK-SD-NEXT: extr x3, x2, x7, #3
-; CHECK-SD-NEXT: mov x0, x8
-; CHECK-SD-NEXT: mov x2, x9
-; CHECK-SD-NEXT: ret
-;
-; CHECK-GI-LABEL: fshr_v2i128_c:
-; CHECK-GI: // %bb.0: // %entry
-; CHECK-GI-NEXT: lsl x8, x5, #61
-; CHECK-GI-NEXT: lsl x9, x7, #61
-; CHECK-GI-NEXT: lsr x10, x5, #3
-; CHECK-GI-NEXT: lsr x11, x7, #3
-; CHECK-GI-NEXT: orr x8, x8, x4, lsr #3
-; CHECK-GI-NEXT: orr x9, x9, x6, lsr #3
-; CHECK-GI-NEXT: orr x1, x10, x0, lsl #61
-; CHECK-GI-NEXT: orr x3, x11, x2, lsl #61
-; CHECK-GI-NEXT: mov x0, x8
-; CHECK-GI-NEXT: mov x2, x9
-; CHECK-GI-NEXT: ret
+; CHECK-LABEL: fshr_v2i128_c:
+; CHECK: // %bb.0: // %entry
+; CHECK-NEXT: extr x8, x5, x4, #3
+; CHECK-NEXT: extr x9, x7, x6, #3
+; CHECK-NEXT: extr x1, x0, x5, #3
+; CHECK-NEXT: extr x3, x2, x7, #3
+; CHECK-NEXT: mov x0, x8
+; CHECK-NEXT: mov x2, x9
+; CHECK-NEXT: ret
entry:
%d = call <2 x i128> @llvm.fshr(<2 x i128> %a, <2 x i128> %b, <2 x i128> <i128 3, i128 3>)
ret <2 x i128> %d
diff --git a/llvm/test/CodeGen/AArch64/funnel-shift.ll b/llvm/test/CodeGen/AArch64/funnel-shift.ll
index f9fd2ad..90fb102 100644
--- a/llvm/test/CodeGen/AArch64/funnel-shift.ll
+++ b/llvm/test/CodeGen/AArch64/funnel-shift.ll
@@ -85,41 +85,40 @@ define i128 @fshl_i128(i128 %x, i128 %y, i128 %z) nounwind {
;
; CHECK-GI-LABEL: fshl_i128:
; CHECK-GI: // %bb.0:
+; CHECK-GI-NEXT: mov w8, #64 // =0x40
; CHECK-GI-NEXT: and x9, x4, #0x7f
-; CHECK-GI-NEXT: mov w10, #64 // =0x40
-; CHECK-GI-NEXT: lsl x14, x3, #63
-; CHECK-GI-NEXT: sub x12, x10, x9
+; CHECK-GI-NEXT: mov w10, #127 // =0x7f
+; CHECK-GI-NEXT: sub x12, x8, x9
; CHECK-GI-NEXT: lsl x13, x1, x9
-; CHECK-GI-NEXT: mov w8, #127 // =0x7f
+; CHECK-GI-NEXT: bic x10, x10, x4
; CHECK-GI-NEXT: lsr x12, x0, x12
-; CHECK-GI-NEXT: bic x8, x8, x4
-; CHECK-GI-NEXT: sub x15, x9, #64
+; CHECK-GI-NEXT: sub x14, x9, #64
+; CHECK-GI-NEXT: lsl x15, x0, x9
+; CHECK-GI-NEXT: extr x16, x3, x2, #1
; CHECK-GI-NEXT: cmp x9, #64
-; CHECK-GI-NEXT: lsl x9, x0, x9
-; CHECK-GI-NEXT: lsl x15, x0, x15
-; CHECK-GI-NEXT: orr x12, x12, x13
-; CHECK-GI-NEXT: orr x13, x14, x2, lsr #1
-; CHECK-GI-NEXT: lsr x14, x3, #1
-; CHECK-GI-NEXT: sub x10, x10, x8
-; CHECK-GI-NEXT: sub x16, x8, #64
-; CHECK-GI-NEXT: csel x9, x9, xzr, lo
-; CHECK-GI-NEXT: lsr x17, x13, x8
-; CHECK-GI-NEXT: lsl x10, x14, x10
-; CHECK-GI-NEXT: csel x12, x12, x15, lo
+; CHECK-GI-NEXT: sub x8, x8, x10
+; CHECK-GI-NEXT: orr x9, x12, x13
+; CHECK-GI-NEXT: lsr x12, x3, #1
+; CHECK-GI-NEXT: lsl x13, x0, x14
+; CHECK-GI-NEXT: csel x14, x15, xzr, lo
+; CHECK-GI-NEXT: sub x15, x10, #64
+; CHECK-GI-NEXT: lsr x17, x16, x10
+; CHECK-GI-NEXT: lsl x8, x12, x8
+; CHECK-GI-NEXT: csel x9, x9, x13, lo
; CHECK-GI-NEXT: tst x4, #0x7f
-; CHECK-GI-NEXT: lsr x15, x14, x16
+; CHECK-GI-NEXT: lsr x13, x12, x15
; CHECK-GI-NEXT: mvn x11, x4
-; CHECK-GI-NEXT: csel x12, x1, x12, eq
-; CHECK-GI-NEXT: orr x10, x17, x10
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: lsr x14, x14, x8
-; CHECK-GI-NEXT: csel x10, x10, x15, lo
+; CHECK-GI-NEXT: csel x9, x1, x9, eq
+; CHECK-GI-NEXT: orr x8, x17, x8
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: lsr x12, x12, x10
+; CHECK-GI-NEXT: csel x8, x8, x13, lo
; CHECK-GI-NEXT: tst x11, #0x7f
-; CHECK-GI-NEXT: csel x10, x13, x10, eq
-; CHECK-GI-NEXT: cmp x8, #64
-; CHECK-GI-NEXT: csel x8, x14, xzr, lo
-; CHECK-GI-NEXT: orr x0, x9, x10
-; CHECK-GI-NEXT: orr x1, x12, x8
+; CHECK-GI-NEXT: csel x8, x16, x8, eq
+; CHECK-GI-NEXT: cmp x10, #64
+; CHECK-GI-NEXT: csel x10, x12, xzr, lo
+; CHECK-GI-NEXT: orr x0, x14, x8
+; CHECK-GI-NEXT: orr x1, x9, x10
; CHECK-GI-NEXT: ret
%f = call i128 @llvm.fshl.i128(i128 %x, i128 %y, i128 %z)
ret i128 %f
diff --git a/llvm/test/CodeGen/AArch64/rem-by-const.ll b/llvm/test/CodeGen/AArch64/rem-by-const.ll
index 1cb92e4..87b1108 100644
--- a/llvm/test/CodeGen/AArch64/rem-by-const.ll
+++ b/llvm/test/CodeGen/AArch64/rem-by-const.ll
@@ -559,20 +559,18 @@ define i128 @ui128_7(i128 %a, i128 %b) {
; CHECK-GI-NEXT: add x8, x8, x10
; CHECK-GI-NEXT: subs x10, x0, x9
; CHECK-GI-NEXT: sbc x11, x1, x8
-; CHECK-GI-NEXT: lsl x12, x11, #63
+; CHECK-GI-NEXT: extr x10, x11, x10, #1
; CHECK-GI-NEXT: lsr x11, x11, #1
-; CHECK-GI-NEXT: orr x10, x12, x10, lsr #1
; CHECK-GI-NEXT: adds x9, x10, x9
+; CHECK-GI-NEXT: mov w10, #7 // =0x7
; CHECK-GI-NEXT: adc x8, x11, x8
-; CHECK-GI-NEXT: lsl x10, x8, #62
+; CHECK-GI-NEXT: extr x9, x8, x9, #2
; CHECK-GI-NEXT: lsr x8, x8, #2
-; CHECK-GI-NEXT: orr x9, x10, x9, lsr #2
-; CHECK-GI-NEXT: mov w10, #7 // =0x7
-; CHECK-GI-NEXT: lsl x12, x8, #3
; CHECK-GI-NEXT: umulh x10, x9, x10
; CHECK-GI-NEXT: lsl x11, x9, #3
-; CHECK-GI-NEXT: sub x8, x12, x8
+; CHECK-GI-NEXT: lsl x12, x8, #3
; CHECK-GI-NEXT: sub x9, x11, x9
+; CHECK-GI-NEXT: sub x8, x12, x8
; CHECK-GI-NEXT: subs x0, x0, x9
; CHECK-GI-NEXT: add x8, x8, x10
; CHECK-GI-NEXT: sbc x1, x1, x8
@@ -640,10 +638,9 @@ define i128 @ui128_100(i128 %a, i128 %b) {
; CHECK-GI-NEXT: add x10, x11, x12
; CHECK-GI-NEXT: add x8, x8, x14
; CHECK-GI-NEXT: add x8, x8, x10
-; CHECK-GI-NEXT: lsl x10, x8, #60
-; CHECK-GI-NEXT: lsr x8, x8, #4
-; CHECK-GI-NEXT: orr x9, x10, x9, lsr #4
; CHECK-GI-NEXT: mov w10, #100 // =0x64
+; CHECK-GI-NEXT: extr x9, x8, x9, #4
+; CHECK-GI-NEXT: lsr x8, x8, #4
; CHECK-GI-NEXT: umulh x11, x9, x10
; CHECK-GI-NEXT: mul x9, x9, x10
; CHECK-GI-NEXT: madd x8, x8, x10, x11
@@ -3317,36 +3314,32 @@ define <2 x i128> @uv2i128_7(<2 x i128> %d, <2 x i128> %e) {
; CHECK-GI-NEXT: sbc x14, x1, x12
; CHECK-GI-NEXT: add x8, x8, x13
; CHECK-GI-NEXT: subs x13, x2, x10
-; CHECK-GI-NEXT: lsl x15, x14, #63
-; CHECK-GI-NEXT: sbc x16, x3, x8
+; CHECK-GI-NEXT: extr x9, x14, x9, #1
+; CHECK-GI-NEXT: sbc x15, x3, x8
; CHECK-GI-NEXT: lsr x14, x14, #1
-; CHECK-GI-NEXT: orr x9, x15, x9, lsr #1
-; CHECK-GI-NEXT: lsl x15, x16, #63
-; CHECK-GI-NEXT: orr x13, x15, x13, lsr #1
+; CHECK-GI-NEXT: extr x13, x15, x13, #1
; CHECK-GI-NEXT: adds x9, x9, x11
-; CHECK-GI-NEXT: lsr x11, x16, #1
+; CHECK-GI-NEXT: lsr x11, x15, #1
; CHECK-GI-NEXT: adc x12, x14, x12
; CHECK-GI-NEXT: adds x10, x13, x10
-; CHECK-GI-NEXT: lsl x13, x12, #62
-; CHECK-GI-NEXT: lsr x12, x12, #2
-; CHECK-GI-NEXT: adc x8, x11, x8
-; CHECK-GI-NEXT: lsl x11, x8, #62
-; CHECK-GI-NEXT: orr x9, x13, x9, lsr #2
+; CHECK-GI-NEXT: extr x9, x12, x9, #2
; CHECK-GI-NEXT: mov w13, #7 // =0x7
+; CHECK-GI-NEXT: adc x8, x11, x8
+; CHECK-GI-NEXT: lsr x11, x12, #2
+; CHECK-GI-NEXT: extr x10, x8, x10, #2
+; CHECK-GI-NEXT: umulh x12, x9, x13
; CHECK-GI-NEXT: lsr x8, x8, #2
-; CHECK-GI-NEXT: lsl x14, x12, #3
-; CHECK-GI-NEXT: orr x10, x11, x10, lsr #2
-; CHECK-GI-NEXT: umulh x11, x9, x13
+; CHECK-GI-NEXT: lsl x14, x11, #3
; CHECK-GI-NEXT: lsl x15, x9, #3
-; CHECK-GI-NEXT: sub x12, x14, x12
-; CHECK-GI-NEXT: lsl x16, x8, #3
; CHECK-GI-NEXT: umulh x13, x10, x13
+; CHECK-GI-NEXT: lsl x16, x8, #3
+; CHECK-GI-NEXT: sub x11, x14, x11
; CHECK-GI-NEXT: lsl x14, x10, #3
; CHECK-GI-NEXT: sub x9, x15, x9
; CHECK-GI-NEXT: sub x8, x16, x8
; CHECK-GI-NEXT: subs x0, x0, x9
+; CHECK-GI-NEXT: add x11, x11, x12
; CHECK-GI-NEXT: sub x10, x14, x10
-; CHECK-GI-NEXT: add x11, x12, x11
; CHECK-GI-NEXT: sbc x1, x1, x11
; CHECK-GI-NEXT: subs x2, x2, x10
; CHECK-GI-NEXT: add x8, x8, x13
@@ -3394,9 +3387,10 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
; CHECK-GI: // %bb.0: // %entry
; CHECK-GI-NEXT: mov x10, #23593 // =0x5c29
; CHECK-GI-NEXT: mov x8, #62914 // =0xf5c2
-; CHECK-GI-NEXT: sub x18, x0, x0
+; CHECK-GI-NEXT: and x5, xzr, #0x1
; CHECK-GI-NEXT: movk x10, #49807, lsl #16
; CHECK-GI-NEXT: movk x8, #23592, lsl #16
+; CHECK-GI-NEXT: umulh x18, x0, xzr
; CHECK-GI-NEXT: movk x10, #10485, lsl #32
; CHECK-GI-NEXT: movk x8, #49807, lsl #32
; CHECK-GI-NEXT: movk x10, #36700, lsl #48
@@ -3409,84 +3403,81 @@ define <2 x i128> @uv2i128_100(<2 x i128> %d, <2 x i128> %e) {
; CHECK-GI-NEXT: umulh x15, x1, x10
; CHECK-GI-NEXT: cset w12, hs
; CHECK-GI-NEXT: cmn x11, x13
-; CHECK-GI-NEXT: and x11, x12, #0x1
-; CHECK-GI-NEXT: umulh x16, x0, x8
-; CHECK-GI-NEXT: cset w12, hs
+; CHECK-GI-NEXT: sub x13, x0, x0
; CHECK-GI-NEXT: and x12, x12, #0x1
-; CHECK-GI-NEXT: add x14, x14, x18
-; CHECK-GI-NEXT: add x11, x11, x12
-; CHECK-GI-NEXT: and x12, xzr, #0x1
+; CHECK-GI-NEXT: umulh x16, x0, x8
+; CHECK-GI-NEXT: cset w11, hs
+; CHECK-GI-NEXT: add x13, x14, x13
+; CHECK-GI-NEXT: and x11, x11, #0x1
+; CHECK-GI-NEXT: and x14, xzr, #0x1
; CHECK-GI-NEXT: umulh x9, xzr, x10
-; CHECK-GI-NEXT: adds x14, x14, x15
-; CHECK-GI-NEXT: and x15, xzr, #0x1
+; CHECK-GI-NEXT: add x11, x12, x11
+; CHECK-GI-NEXT: add x12, x5, x14
+; CHECK-GI-NEXT: adds x13, x13, x15
; CHECK-GI-NEXT: umulh x17, x1, x8
-; CHECK-GI-NEXT: cset w4, hs
-; CHECK-GI-NEXT: add x15, x12, x15
-; CHECK-GI-NEXT: adds x12, x14, x16
-; CHECK-GI-NEXT: and x4, x4, #0x1
-; CHECK-GI-NEXT: mul x18, x3, x10
; CHECK-GI-NEXT: cset w14, hs
-; CHECK-GI-NEXT: adds x12, x12, x11
-; CHECK-GI-NEXT: add x11, x15, x4
; CHECK-GI-NEXT: and x14, x14, #0x1
-; CHECK-GI-NEXT: cset w15, hs
-; CHECK-GI-NEXT: mul x5, x2, x8
-; CHECK-GI-NEXT: add x11, x11, x14
-; CHECK-GI-NEXT: and x14, x15, #0x1
-; CHECK-GI-NEXT: add x17, x9, x17
-; CHECK-GI-NEXT: add x14, x11, x14
-; CHECK-GI-NEXT: mov w11, #100 // =0x64
-; CHECK-GI-NEXT: umulh x13, x0, xzr
-; CHECK-GI-NEXT: umulh x16, x2, x10
-; CHECK-GI-NEXT: adds x18, x18, x5
-; CHECK-GI-NEXT: mul x15, x3, x8
-; CHECK-GI-NEXT: add x13, x17, x13
-; CHECK-GI-NEXT: cset w17, hs
-; CHECK-GI-NEXT: umulh x10, x3, x10
-; CHECK-GI-NEXT: add x13, x13, x14
-; CHECK-GI-NEXT: and x17, x17, #0x1
-; CHECK-GI-NEXT: cmn x18, x16
-; CHECK-GI-NEXT: sub x18, x2, x2
-; CHECK-GI-NEXT: umulh x16, x2, x8
+; CHECK-GI-NEXT: adds x13, x13, x16
+; CHECK-GI-NEXT: mul x4, x3, x10
+; CHECK-GI-NEXT: add x12, x12, x14
; CHECK-GI-NEXT: cset w14, hs
-; CHECK-GI-NEXT: and x14, x14, #0x1
-; CHECK-GI-NEXT: add x15, x15, x18
+; CHECK-GI-NEXT: adds x11, x13, x11
+; CHECK-GI-NEXT: and x13, x14, #0x1
+; CHECK-GI-NEXT: mul x15, x2, x8
+; CHECK-GI-NEXT: cset w14, hs
+; CHECK-GI-NEXT: add x12, x12, x13
+; CHECK-GI-NEXT: and x13, x14, #0x1
+; CHECK-GI-NEXT: add x14, x9, x17
+; CHECK-GI-NEXT: sub x17, x2, x2
+; CHECK-GI-NEXT: umulh x16, x2, x10
+; CHECK-GI-NEXT: add x12, x12, x13
+; CHECK-GI-NEXT: add x13, x14, x18
+; CHECK-GI-NEXT: add x12, x13, x12
; CHECK-GI-NEXT: and x18, xzr, #0x1
-; CHECK-GI-NEXT: add x14, x17, x14
+; CHECK-GI-NEXT: mul x5, x3, x8
+; CHECK-GI-NEXT: extr x11, x12, x11, #4
+; CHECK-GI-NEXT: adds x13, x4, x15
+; CHECK-GI-NEXT: umulh x14, x3, x10
+; CHECK-GI-NEXT: cset w15, hs
+; CHECK-GI-NEXT: mov w10, #100 // =0x64
+; CHECK-GI-NEXT: cmn x13, x16
+; CHECK-GI-NEXT: and x15, x15, #0x1
+; CHECK-GI-NEXT: umulh x13, x2, x8
+; CHECK-GI-NEXT: cset w16, hs
+; CHECK-GI-NEXT: add x17, x5, x17
+; CHECK-GI-NEXT: and x16, x16, #0x1
; CHECK-GI-NEXT: umulh x8, x3, x8
+; CHECK-GI-NEXT: add x15, x15, x16
+; CHECK-GI-NEXT: adds x14, x17, x14
; CHECK-GI-NEXT: and x17, xzr, #0x1
-; CHECK-GI-NEXT: adds x10, x15, x10
-; CHECK-GI-NEXT: add x15, x17, x18
+; CHECK-GI-NEXT: add x16, x18, x17
; CHECK-GI-NEXT: cset w17, hs
-; CHECK-GI-NEXT: umulh x18, x2, xzr
+; CHECK-GI-NEXT: adds x13, x14, x13
+; CHECK-GI-NEXT: umulh x14, x2, xzr
; CHECK-GI-NEXT: and x17, x17, #0x1
-; CHECK-GI-NEXT: adds x10, x10, x16
-; CHECK-GI-NEXT: lsl x16, x13, #60
-; CHECK-GI-NEXT: add x15, x15, x17
-; CHECK-GI-NEXT: cset w17, hs
-; CHECK-GI-NEXT: adds x10, x10, x14
-; CHECK-GI-NEXT: and x14, x17, #0x1
+; CHECK-GI-NEXT: cset w18, hs
+; CHECK-GI-NEXT: adds x13, x13, x15
+; CHECK-GI-NEXT: add x15, x16, x17
+; CHECK-GI-NEXT: and x16, x18, #0x1
; CHECK-GI-NEXT: cset w17, hs
; CHECK-GI-NEXT: add x8, x9, x8
-; CHECK-GI-NEXT: add x14, x15, x14
-; CHECK-GI-NEXT: and x15, x17, #0x1
-; CHECK-GI-NEXT: orr x12, x16, x12, lsr #4
-; CHECK-GI-NEXT: add x9, x14, x15
-; CHECK-GI-NEXT: add x8, x8, x18
-; CHECK-GI-NEXT: add x8, x8, x9
-; CHECK-GI-NEXT: lsr x9, x13, #4
-; CHECK-GI-NEXT: umulh x14, x12, x11
-; CHECK-GI-NEXT: lsl x13, x8, #60
+; CHECK-GI-NEXT: add x15, x15, x16
+; CHECK-GI-NEXT: and x16, x17, #0x1
+; CHECK-GI-NEXT: lsr x9, x12, #4
+; CHECK-GI-NEXT: add x15, x15, x16
+; CHECK-GI-NEXT: umulh x17, x11, x10
+; CHECK-GI-NEXT: add x8, x8, x14
+; CHECK-GI-NEXT: add x8, x8, x15
+; CHECK-GI-NEXT: mul x11, x11, x10
+; CHECK-GI-NEXT: extr x12, x8, x13, #4
; CHECK-GI-NEXT: lsr x8, x8, #4
-; CHECK-GI-NEXT: mul x12, x12, x11
-; CHECK-GI-NEXT: orr x10, x13, x10, lsr #4
-; CHECK-GI-NEXT: madd x9, x9, x11, x14
-; CHECK-GI-NEXT: umulh x13, x10, x11
-; CHECK-GI-NEXT: subs x0, x0, x12
-; CHECK-GI-NEXT: mul x10, x10, x11
+; CHECK-GI-NEXT: madd x9, x9, x10, x17
+; CHECK-GI-NEXT: umulh x13, x12, x10
+; CHECK-GI-NEXT: subs x0, x0, x11
+; CHECK-GI-NEXT: mul x12, x12, x10
; CHECK-GI-NEXT: sbc x1, x1, x9
-; CHECK-GI-NEXT: madd x8, x8, x11, x13
-; CHECK-GI-NEXT: subs x2, x2, x10
+; CHECK-GI-NEXT: madd x8, x8, x10, x13
+; CHECK-GI-NEXT: subs x2, x2, x12
; CHECK-GI-NEXT: sbc x3, x3, x8
; CHECK-GI-NEXT: ret
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
index 221e2fd..09e1fca 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.ll
@@ -1200,7 +1200,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
; GFX6-NEXT: s_mov_b32 s5, s7
; GFX6-NEXT: s_mov_b32 s6, s8
; GFX6-NEXT: s_mov_b32 s7, s9
-; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX6-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1213,7 +1213,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
; GFX8-NEXT: s_mov_b32 s5, s7
; GFX8-NEXT: s_mov_b32 s6, s8
; GFX8-NEXT: s_mov_b32 s7, s9
-; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX8-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
; GFX8-NEXT: s_endpgm
;
; GFX900-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1226,7 +1226,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
; GFX900-NEXT: s_mov_b32 s5, s7
; GFX900-NEXT: s_mov_b32 s6, s8
; GFX900-NEXT: s_mov_b32 s7, s9
-; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX900-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
; GFX900-NEXT: s_endpgm
;
; GFX90A-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1239,7 +1239,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
; GFX90A-NEXT: s_mov_b32 s5, s7
; GFX90A-NEXT: s_mov_b32 s6, s8
; GFX90A-NEXT: s_mov_b32 s7, s9
-; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 unorm
; GFX90A-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1252,7 +1252,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
; GFX10PLUS-NEXT: s_mov_b32 s5, s7
; GFX10PLUS-NEXT: s_mov_b32 s6, s8
; GFX10PLUS-NEXT: s_mov_b32 s7, s9
-; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpswap_i32_1d_no_return:
@@ -1265,7 +1265,7 @@ define amdgpu_ps void @atomic_cmpswap_i32_1d_no_return(<8 x i32> inreg %rsrc, i3
; GFX12-NEXT: s_mov_b32 s5, s7
; GFX12-NEXT: s_mov_b32 s6, s8
; GFX12-NEXT: s_mov_b32 s7, s9
-; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
; GFX12-NEXT: s_endpgm
main_body:
%v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -3194,7 +3194,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
; GFX6-NEXT: s_mov_b32 s5, s7
; GFX6-NEXT: s_mov_b32 s6, s8
; GFX6-NEXT: s_mov_b32 s7, s9
-; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX6-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
; GFX6-NEXT: s_endpgm
;
; GFX8-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3207,7 +3207,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
; GFX8-NEXT: s_mov_b32 s5, s7
; GFX8-NEXT: s_mov_b32 s6, s8
; GFX8-NEXT: s_mov_b32 s7, s9
-; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX8-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
; GFX8-NEXT: s_endpgm
;
; GFX900-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3220,7 +3220,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
; GFX900-NEXT: s_mov_b32 s5, s7
; GFX900-NEXT: s_mov_b32 s6, s8
; GFX900-NEXT: s_mov_b32 s7, s9
-; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX900-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
; GFX900-NEXT: s_endpgm
;
; GFX90A-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3233,7 +3233,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
; GFX90A-NEXT: s_mov_b32 s5, s7
; GFX90A-NEXT: s_mov_b32 s6, s8
; GFX90A-NEXT: s_mov_b32 s7, s9
-; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf unorm
; GFX90A-NEXT: s_endpgm
;
; GFX10PLUS-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3246,7 +3246,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
; GFX10PLUS-NEXT: s_mov_b32 s5, s7
; GFX10PLUS-NEXT: s_mov_b32 s6, s8
; GFX10PLUS-NEXT: s_mov_b32 s7, s9
-; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm glc
+; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
; GFX10PLUS-NEXT: s_endpgm
;
; GFX12-LABEL: atomic_cmpswap_i64_1d_no_return:
@@ -3259,7 +3259,7 @@ define amdgpu_ps void @atomic_cmpswap_i64_1d_no_return(<8 x i32> inreg %rsrc, i6
; GFX12-NEXT: s_mov_b32 s5, s7
; GFX12-NEXT: s_mov_b32 s6, s8
; GFX12-NEXT: s_mov_b32 s7, s9
-; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
+; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
; GFX12-NEXT: s_endpgm
main_body:
%v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
index 292fa4b..4f160b6 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.atomic.dim.mir
@@ -25,6 +25,7 @@ body: |
; GFX6-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si]].sub0
; GFX6-NEXT: $vgpr0 = COPY [[COPY3]]
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX8-LABEL: name: atomic_cmpswap_i32_1d
; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX8-NEXT: {{ $}}
@@ -35,6 +36,7 @@ body: |
; GFX8-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_vi]].sub0
; GFX8-NEXT: $vgpr0 = COPY [[COPY3]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX10-LABEL: name: atomic_cmpswap_i32_1d
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
@@ -45,6 +47,7 @@ body: |
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_]].sub0
; GFX10-NEXT: $vgpr0 = COPY [[COPY3]]
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX11-LABEL: name: atomic_cmpswap_i32_1d
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
@@ -55,6 +58,7 @@ body: |
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_]].sub0
; GFX11-NEXT: $vgpr0 = COPY [[COPY3]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0
+ ;
; GFX12-LABEL: name: atomic_cmpswap_i32_1d
; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX12-NEXT: {{ $}}
@@ -89,39 +93,43 @@ body: |
; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_si:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_si [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX6-NEXT: S_ENDPGM 0
+ ;
; GFX8-LABEL: name: atomic_cmpswap_i32_1d_no_return
; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V1_V1_vi:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 3, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
+ ;
; GFX10-LABEL: name: atomic_cmpswap_i32_1d_no_return
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX10-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: atomic_cmpswap_i32_1d_no_return
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: atomic_cmpswap_i32_1d_no_return
; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1, $vgpr2
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_64 = COPY $vgpr0_vgpr1
; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr2
- ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12_:%[0-9]+]]:vreg_64 = IMAGE_ATOMIC_CMPSWAP_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
+ ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V2_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 3, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s32), addrspace 8)
; GFX12-NEXT: S_ENDPGM 0
%0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
%1:vgpr(<2 x s32>) = COPY $vgpr0_vgpr1
@@ -150,6 +158,7 @@ body: |
; GFX6-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si]].sub0_sub1
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
; GFX6-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+ ;
; GFX8-LABEL: name: atomic_cmpswap_i64_1d
; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX8-NEXT: {{ $}}
@@ -160,6 +169,7 @@ body: |
; GFX8-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi]].sub0_sub1
; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
; GFX8-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+ ;
; GFX10-LABEL: name: atomic_cmpswap_i64_1d
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX10-NEXT: {{ $}}
@@ -170,6 +180,7 @@ body: |
; GFX10-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_]].sub0_sub1
; GFX10-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
; GFX10-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+ ;
; GFX11-LABEL: name: atomic_cmpswap_i64_1d
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX11-NEXT: {{ $}}
@@ -180,6 +191,7 @@ body: |
; GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY killed [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_]].sub0_sub1
; GFX11-NEXT: $vgpr0_vgpr1 = COPY [[COPY3]]
; GFX11-NEXT: SI_RETURN_TO_EPILOG implicit $vgpr0_vgpr1
+ ;
; GFX12-LABEL: name: atomic_cmpswap_i64_1d
; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX12-NEXT: {{ $}}
@@ -214,39 +226,43 @@ body: |
; GFX6-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX6-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; GFX6-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX6-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_si:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+ ; GFX6-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_si [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
; GFX6-NEXT: S_ENDPGM 0
+ ;
; GFX8-LABEL: name: atomic_cmpswap_i64_1d_no_return
; GFX8: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX8-NEXT: {{ $}}
; GFX8-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX8-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; GFX8-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX8-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_vi:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+ ; GFX8-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_vi [[COPY1]], [[COPY2]], [[COPY]], 15, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
; GFX8-NEXT: S_ENDPGM 0
+ ;
; GFX10-LABEL: name: atomic_cmpswap_i64_1d_no_return
; GFX10: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX10-NEXT: {{ $}}
; GFX10-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX10-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; GFX10-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX10-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+ ; GFX10-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx10 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
; GFX10-NEXT: S_ENDPGM 0
+ ;
; GFX11-LABEL: name: atomic_cmpswap_i64_1d_no_return
; GFX11: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX11-NEXT: {{ $}}
; GFX11-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX11-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX11-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 1, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+ ; GFX11-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx11 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
; GFX11-NEXT: S_ENDPGM 0
+ ;
; GFX12-LABEL: name: atomic_cmpswap_i64_1d_no_return
; GFX12: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, $vgpr0_vgpr1_vgpr2_vgpr3, $vgpr4
; GFX12-NEXT: {{ $}}
; GFX12-NEXT: [[COPY:%[0-9]+]]:sgpr_256 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
; GFX12-NEXT: [[COPY1:%[0-9]+]]:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3
; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr4
- ; GFX12-NEXT: [[IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12_:%[0-9]+]]:vreg_128 = IMAGE_ATOMIC_CMPSWAP_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 1, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
+ ; GFX12-NEXT: IMAGE_ATOMIC_CMPSWAP_NORTN_V4_V1_gfx12 [[COPY1]], [[COPY2]], [[COPY]], 15, 0, 0, 0, 0, 0, implicit $exec :: (volatile dereferenceable load store (s64), addrspace 8)
; GFX12-NEXT: S_ENDPGM 0
%0:sgpr(<8 x s32>) = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7
%1:vgpr(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
index 6c4f504..33ce278 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-uniform-waterfall.ll
@@ -23,7 +23,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero(ptr addrspace(1)
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -75,7 +77,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_swap_op(ptr addrs
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -126,6 +130,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero(ptr addrspace(1)
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -175,6 +181,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_swap(ptr addrspac
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[DONE]])
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -225,7 +233,9 @@ define protected amdgpu_kernel void @trivial_uniform_waterfall(ptr addrspace(1)
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 0, 0
@@ -292,7 +302,9 @@ define protected amdgpu_kernel void @uniform_waterfall(ptr addrspace(1) %out, i3
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ [[NEW_DONE:%.*]], %[[TAIL:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i64 @llvm.amdgcn.ballot.i64(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF:.*]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: [[IS_FIRST_ACTIVE_ID:%.*]] = icmp eq i32 [[MYMASK]], [[MYMASK]]
@@ -359,7 +371,9 @@ define protected amdgpu_kernel void @trivial_waterfall_eq_zero_i32(ptr addrspace
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
; PASS-CHECK-NEXT: [[NOT_DONE:%.*]] = xor i1 [[DONE]], true
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[NOT_DONE]])
; PASS-CHECK-NEXT: [[TMP0:%.*]] = xor i1 [[NOT_DONE]], true
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp eq i32 [[BALLOT]], 0
; PASS-CHECK-NEXT: br i1 [[TMP0]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
@@ -410,6 +424,8 @@ define protected amdgpu_kernel void @trivial_waterfall_ne_zero_i32(ptr addrspace
; PASS-CHECK-NEXT: br label %[[WHILE:.*]]
; PASS-CHECK: [[WHILE]]:
; PASS-CHECK-NEXT: [[DONE:%.*]] = phi i1 [ false, %[[ENTRY]] ], [ true, %[[IF:.*]] ]
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = tail call i32 @llvm.amdgcn.ballot.i32(i1 [[DONE]])
+; PASS-CHECK-NEXT: [[IS_DONE:%.*]] = icmp ne i32 0, [[BALLOT]]
; PASS-CHECK-NEXT: br i1 [[DONE]], label %[[EXIT:.*]], label %[[IF]]
; PASS-CHECK: [[IF]]:
; PASS-CHECK-NEXT: store i32 5, ptr addrspace(1) [[OUT]], align 4
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
index aa11574..a3e42e5 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-uniform-intrinsic-combine.ll
@@ -595,6 +595,8 @@ define amdgpu_kernel void @ballot_i32(i32 %v, ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i32(
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i32 @llvm.amdgcn.ballot.i32(i1 [[C]])
+; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i32 [[BALLOT]], 0
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
; PASS-CHECK-NEXT: ret void
;
@@ -623,6 +625,8 @@ define amdgpu_kernel void @ballot_i64(i32 %v, ptr addrspace(1) %out) {
; PASS-CHECK-LABEL: define amdgpu_kernel void @ballot_i64(
; PASS-CHECK-SAME: i32 [[V:%.*]], ptr addrspace(1) [[OUT:%.*]]) #[[ATTR0]] {
; PASS-CHECK-NEXT: [[C:%.*]] = trunc i32 [[V]] to i1
+; PASS-CHECK-NEXT: [[BALLOT:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 [[C]])
+; PASS-CHECK-NEXT: [[BALLOT_NE_ZERO:%.*]] = icmp ne i64 [[BALLOT]], 0
; PASS-CHECK-NEXT: store i1 [[C]], ptr addrspace(1) [[OUT]], align 1
; PASS-CHECK-NEXT: ret void
;
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
index 49607e3..83f0229 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.dim.gfx90a.ll
@@ -92,8 +92,7 @@ define amdgpu_ps void @atomic_swap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s)
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v1, a0
-; GFX90A-NEXT: image_atomic_swap v1, v0, s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT: image_atomic_swap a0, v0, s[0:7] dmask:0x1 unorm
; GFX90A-NEXT: s_endpgm
%data = call i32 asm "; def $0", "=a"()
%unused = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -106,8 +105,7 @@ define amdgpu_ps void @atomic_add_2d_agpr_noret(<8 x i32> inreg %rsrc, i32 %s, i
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a0
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: image_atomic_add v2, v[0:1], s[0:7] dmask:0x1 unorm glc
+; GFX90A-NEXT: image_atomic_add a0, v[0:1], s[0:7] dmask:0x1 unorm
; GFX90A-NEXT: s_endpgm
%data = call i32 asm "; def $0", "=a"()
%unused = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
@@ -123,9 +121,7 @@ define amdgpu_ps void @atomic_cmpswap_1d_agpr_noret(<8 x i32> inreg %rsrc, i32 %
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a1
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: image_atomic_cmpswap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap a[0:1], v0, s[0:7] dmask:0x3 unorm
; GFX90A-NEXT: s_endpgm
%cmp = call i32 asm "; def $0", "=a"()
%swap = call i32 asm "; def $0", "=a"()
@@ -139,9 +135,7 @@ define amdgpu_ps void @atomic_swap_1d_i64_agpr_noret(<8 x i32> inreg %rsrc, i32
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
-; GFX90A-NEXT: image_atomic_swap v[2:3], v0, s[0:7] dmask:0x3 unorm glc
+; GFX90A-NEXT: image_atomic_swap a[0:1], v0, s[0:7] dmask:0x3 unorm
; GFX90A-NEXT: s_endpgm
%data = call i64 asm "; def $0", "=a"()
%unused = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
@@ -154,14 +148,10 @@ define amdgpu_ps void @atomic_cmpswap_1d_64_agpr_noret(<8 x i32> inreg %rsrc, i3
; GFX90A-NEXT: ;;#ASMSTART
; GFX90A-NEXT: ; def a[0:1]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v3, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v2, a0
; GFX90A-NEXT: ;;#ASMSTART
-; GFX90A-NEXT: ; def a[0:1]
+; GFX90A-NEXT: ; def a[2:3]
; GFX90A-NEXT: ;;#ASMEND
-; GFX90A-NEXT: v_accvgpr_read_b32 v5, a1
-; GFX90A-NEXT: v_accvgpr_read_b32 v4, a0
-; GFX90A-NEXT: image_atomic_cmpswap v[2:5], v0, s[0:7] dmask:0xf unorm glc
+; GFX90A-NEXT: image_atomic_cmpswap a[0:3], v0, s[0:7] dmask:0xf unorm
; GFX90A-NEXT: s_endpgm
%cmp = call i64 asm "; def $0", "=a"()
%swap = call i64 asm "; def $0", "=a"()
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll
new file mode 100644
index 0000000..6c58a1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.noret.ll
@@ -0,0 +1,581 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS-GISE %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX10PLUS %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12-GISE %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefixes=GFX12 %s
+
+define amdgpu_ps void @atomic_swap_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_swap_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_swap_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_swap_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_swap_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.swap.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_swap_1d_i64(<8 x i32> inreg %rsrc, i64 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_swap_1d_i64:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_swap_1d_i64:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_swap_1d_i64:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_swap_1d_i64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_swap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.image.atomic.swap.1d.i64.i32(i64 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_swap_1d_float(<8 x i32> inreg %rsrc, float %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_swap_1d_float:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_swap_1d_float:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_swap_1d_float:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_swap_1d_float:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_swap v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call float @llvm.amdgcn.image.atomic.swap.1d.f32.i32(float %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_add_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_sub_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_sub_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_sub_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_sub v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_sub_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_sub_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_sub_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.sub.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_smin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_smin_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_smin_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_smin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_smin_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_smin_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_min_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.smin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_umin_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_umin_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_umin_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_umin v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_umin_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_umin_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_min_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.umin.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_smax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_smax_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_smax_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_smax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_smax_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_smax_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_max_int v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.smax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_umax_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_umax_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_umax_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_umax v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_umax_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_umax_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_max_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.umax.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_and_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_and_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_and_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_and_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_and_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_and v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.and.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_or_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_or_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_or_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_or_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_or_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_or v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.or.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_xor_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_xor_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_xor_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_xor_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_xor_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_xor v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.xor.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_inc_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_inc_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_inc_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_inc v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_inc_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_inc_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_inc_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.inc.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_dec_1d(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_dec_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_dec_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_dec v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_dec_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_dec_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_dec_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.dec.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_cmpswap_1d(<8 x i32> inreg %rsrc, i32 %cmp, i32 %swap, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_cmpswap_1d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_cmpswap_1d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_cmpswap_1d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_cmpswap v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.cmpswap.1d.i32.i32(i32 %cmp, i32 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_cmpswap_1d_64(<8 x i32> inreg %rsrc, i64 %cmp, i64 %swap, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_cmpswap_1d_64:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_cmpswap_1d_64:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_cmpswap_1d_64:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_cmpswap_1d_64:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_cmpswap v[0:3], v4, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D
+; GFX12-NEXT: s_endpgm
+ %v = call i64 @llvm.amdgcn.image.atomic.cmpswap.1d.i64.i32(i64 %cmp, i64 %swap, i32 %s, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_2d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_2d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.2d.i32.i32(i32 %data, i32 %s, i32 %t, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_3d(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %r) {
+; GFX10PLUS-GISE-LABEL: atomic_add_3d:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_3d:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_3d:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_3d:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_3D
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.3d.i32.i32(i32 %data, i32 %s, i32 %t, i32 %r, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_cube(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %face) {
+; GFX10PLUS-GISE-LABEL: atomic_add_cube:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_cube:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_cube:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_cube:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_CUBE
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.cube.i32.i32(i32 %data, i32 %s, i32 %t, i32 %face, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_1darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %slice) {
+; GFX10PLUS-GISE-LABEL: atomic_add_1darray:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_1darray:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v[1:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_1darray:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_1darray:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D_ARRAY
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.1darray.i32.i32(i32 %data, i32 %s, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_2darray(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2darray:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2darray:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2darray:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_2darray:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_ARRAY
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.2darray.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_2dmsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %fragid) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2dmsaa:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2dmsaa:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v[1:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2dmsaa:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_2dmsaa:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.2dmsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_2darraymsaa(<8 x i32> inreg %rsrc, i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
+; GFX10PLUS-GISE-LABEL: atomic_add_2darraymsaa:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_2darraymsaa:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v[1:4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_2darraymsaa:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_2darraymsaa:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, [v1, v2, v3, v4], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.2darraymsaa.i32.i32(i32 %data, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 0, i32 0)
+ ret void
+}
+
+define amdgpu_ps void @atomic_add_1d_slc(<8 x i32> inreg %rsrc, i32 %data, i32 %s) {
+; GFX10PLUS-GISE-LABEL: atomic_add_1d_slc:
+; GFX10PLUS-GISE: ; %bb.0:
+; GFX10PLUS-GISE-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc
+; GFX10PLUS-GISE-NEXT: s_endpgm
+;
+; GFX10PLUS-LABEL: atomic_add_1d_slc:
+; GFX10PLUS: ; %bb.0:
+; GFX10PLUS-NEXT: image_atomic_add v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D unorm slc
+; GFX10PLUS-NEXT: s_endpgm
+;
+; GFX12-GISE-LABEL: atomic_add_1d_slc:
+; GFX12-GISE: ; %bb.0:
+; GFX12-GISE-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
+; GFX12-GISE-NEXT: s_endpgm
+;
+; GFX12-LABEL: atomic_add_1d_slc:
+; GFX12: ; %bb.0:
+; GFX12-NEXT: image_atomic_add_uint v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
+; GFX12-NEXT: s_endpgm
+ %v = call i32 @llvm.amdgcn.image.atomic.add.1d.i32.i32(i32 %data, i32 %s, <8 x i32> %rsrc, i32 0, i32 2)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
index 3d1d6c8..0ba62e4 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.atomic.pk.add.ll
@@ -41,15 +41,13 @@ main_body:
define amdgpu_ps float @atomic_pk_add_f16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x half> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v2_noret:
; GFX12-SDAG: ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v2_noret:
; GFX12-GISEL: ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -79,15 +77,13 @@ main_body:
define amdgpu_ps float @atomic_pk_add_f16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x half> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_f16_1d_v4_noret:
; GFX12-SDAG: ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: atomic_pk_add_f16_1d_v4_noret:
; GFX12-GISEL: ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: image_atomic_pk_add_f16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -126,15 +122,13 @@ main_body:
define amdgpu_ps float @atomic_pk_add_bf16_1d_v2_noret(<8 x i32> inreg %rsrc, <2 x bfloat> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v2_noret:
; GFX12-SDAG: ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v2_noret:
; GFX12-GISEL: ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v0, v1, s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_1D
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -173,15 +167,13 @@ main_body:
define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_noret(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_noret:
; GFX12-SDAG: ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_noret:
; GFX12-GISEL: ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_RETURN
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
@@ -192,15 +184,13 @@ main_body:
define amdgpu_ps float @atomic_pk_add_bf16_1d_v4_nt(<8 x i32> inreg %rsrc, <4 x bfloat> %data, i32 %s) {
; GFX12-SDAG-LABEL: atomic_pk_add_bf16_1d_v4_nt:
; GFX12-SDAG: ; %bb.0: ; %main_body
-; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
-; GFX12-SDAG-NEXT: s_wait_loadcnt 0x0
+; GFX12-SDAG-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
; GFX12-SDAG-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-SDAG-NEXT: ; return to shader part epilog
;
; GFX12-GISEL-LABEL: atomic_pk_add_bf16_1d_v4_nt:
; GFX12-GISEL: ; %bb.0: ; %main_body
-; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT_RETURN
-; GFX12-GISEL-NEXT: s_wait_loadcnt 0x0
+; GFX12-GISEL-NEXT: image_atomic_pk_add_bf16 v[0:1], v2, s[0:7] dmask:0x3 dim:SQ_RSRC_IMG_1D th:TH_ATOMIC_NT
; GFX12-GISEL-NEXT: v_mov_b32_e32 v0, 1.0
; GFX12-GISEL-NEXT: ; return to shader part epilog
main_body:
diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll
index 7a876f6..3544017 100644
--- a/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll
+++ b/llvm/test/CodeGen/DirectX/ShaderFlags/wave-ops.ll
@@ -76,6 +76,20 @@ entry:
ret i32 %ret
}
+define noundef i32 @wave_reduce_min(i32 noundef %x) {
+entry:
+ ; CHECK: Function wave_reduce_min : [[WAVE_FLAG]]
+ %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %x)
+ ret i32 %ret
+}
+
+define noundef i32 @wave_reduce_umin(i32 noundef %x) {
+entry:
+ ; CHECK: Function wave_reduce_umin : [[WAVE_FLAG]]
+ %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %x)
+ ret i32 %ret
+}
+
define void @wave_active_countbits(i1 %expr) {
entry:
; CHECK: Function wave_active_countbits : [[WAVE_FLAG]]
diff --git a/llvm/test/CodeGen/DirectX/WaveActiveMin.ll b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll
new file mode 100644
index 0000000..24fde48
--- /dev/null
+++ b/llvm/test/CodeGen/DirectX/WaveActiveMin.ll
@@ -0,0 +1,143 @@
+; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s
+
+; Test that for scalar values, WaveActiveMin maps down to the DirectX op
+
+define noundef half @wave_active_min_half(half noundef %expr) {
+entry:
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr, i8 2, i8 0){{$}}
+ %ret = call half @llvm.dx.wave.reduce.min.f16(half %expr)
+ ret half %ret
+}
+
+define noundef float @wave_active_min_float(float noundef %expr) {
+entry:
+; CHECK: call float @dx.op.waveActiveOp.f32(i32 119, float %expr, i8 2, i8 0){{$}}
+ %ret = call float @llvm.dx.wave.reduce.min.f32(float %expr)
+ ret float %ret
+}
+
+define noundef double @wave_active_min_double(double noundef %expr) {
+entry:
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr, i8 2, i8 0){{$}}
+ %ret = call double @llvm.dx.wave.reduce.min.f64(double %expr)
+ ret double %ret
+}
+
+define noundef i16 @wave_active_min_i16(i16 noundef %expr) {
+entry:
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 0){{$}}
+ %ret = call i16 @llvm.dx.wave.reduce.min.i16(i16 %expr)
+ ret i16 %ret
+}
+
+define noundef i32 @wave_active_min_i32(i32 noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 0){{$}}
+ %ret = call i32 @llvm.dx.wave.reduce.min.i32(i32 %expr)
+ ret i32 %ret
+}
+
+define noundef i64 @wave_active_min_i64(i64 noundef %expr) {
+entry:
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 0){{$}}
+ %ret = call i64 @llvm.dx.wave.reduce.min.i64(i64 %expr)
+ ret i64 %ret
+}
+
+define noundef i16 @wave_active_umin_i16(i16 noundef %expr) {
+entry:
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr, i8 2, i8 1){{$}}
+ %ret = call i16 @llvm.dx.wave.reduce.umin.i16(i16 %expr)
+ ret i16 %ret
+}
+
+define noundef i32 @wave_active_umin_i32(i32 noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr, i8 2, i8 1){{$}}
+ %ret = call i32 @llvm.dx.wave.reduce.umin.i32(i32 %expr)
+ ret i32 %ret
+}
+
+define noundef i64 @wave_active_umin_i64(i64 noundef %expr) {
+entry:
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr, i8 2, i8 1){{$}}
+ %ret = call i64 @llvm.dx.wave.reduce.umin.i64(i64 %expr)
+ ret i64 %ret
+}
+
+declare half @llvm.dx.wave.reduce.min.f16(half)
+declare float @llvm.dx.wave.reduce.min.f32(float)
+declare double @llvm.dx.wave.reduce.min.f64(double)
+
+declare i16 @llvm.dx.wave.reduce.min.i16(i16)
+declare i32 @llvm.dx.wave.reduce.min.i32(i32)
+declare i64 @llvm.dx.wave.reduce.min.i64(i64)
+
+declare i16 @llvm.dx.wave.reduce.umin.i16(i16)
+declare i32 @llvm.dx.wave.reduce.umin.i32(i32)
+declare i64 @llvm.dx.wave.reduce.umin.i64(i64)
+
+; Test that for vector values, WaveActiveMin scalarizes and maps down to the
+; DirectX op
+
+define noundef <2 x half> @wave_active_min_v2half(<2 x half> noundef %expr) {
+entry:
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call half @dx.op.waveActiveOp.f16(i32 119, half %expr.i1, i8 2, i8 0){{$}}
+ %ret = call <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half> %expr)
+ ret <2 x half> %ret
+}
+
+define noundef <3 x i32> @wave_active_min_v3i32(<3 x i32> noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 0){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 0){{$}}
+ %ret = call <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32> %expr)
+ ret <3 x i32> %ret
+}
+
+define noundef <4 x double> @wave_active_min_v4f64(<4 x double> noundef %expr) {
+entry:
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i0, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i1, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i2, i8 2, i8 0){{$}}
+; CHECK: call double @dx.op.waveActiveOp.f64(i32 119, double %expr.i3, i8 2, i8 0){{$}}
+ %ret = call <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double> %expr)
+ ret <4 x double> %ret
+}
+
+declare <2 x half> @llvm.dx.wave.reduce.min.v2f16(<2 x half>)
+declare <3 x i32> @llvm.dx.wave.reduce.min.v3i32(<3 x i32>)
+declare <4 x double> @llvm.dx.wave.reduce.min.v4f64(<4 x double>)
+
+define noundef <2 x i16> @wave_active_umin_v2i16(<2 x i16> noundef %expr) {
+entry:
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i16 @dx.op.waveActiveOp.i16(i32 119, i16 %expr.i1, i8 2, i8 1){{$}}
+ %ret = call <2 x i16> @llvm.dx.wave.reduce.umin.v2i16(<2 x i16> %expr)
+ ret <2 x i16> %ret
+}
+
+define noundef <3 x i32> @wave_active_umin_v3i32(<3 x i32> noundef %expr) {
+entry:
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i1, i8 2, i8 1){{$}}
+; CHECK: call i32 @dx.op.waveActiveOp.i32(i32 119, i32 %expr.i2, i8 2, i8 1){{$}}
+ %ret = call <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32> %expr)
+ ret <3 x i32> %ret
+}
+
+define noundef <4 x i64> @wave_active_umin_v4i64(<4 x i64> noundef %expr) {
+entry:
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i0, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i1, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i2, i8 2, i8 1){{$}}
+; CHECK: call i64 @dx.op.waveActiveOp.i64(i32 119, i64 %expr.i3, i8 2, i8 1){{$}}
+ %ret = call <4 x i64> @llvm.dx.wave.reduce.umin.v4i64(<4 x i64> %expr)
+ ret <4 x i64> %ret
+}
+
+declare <2 x i16> @llvm.dx.wave.reduce.umin.v2i16(<2 x i16>)
+declare <3 x i32> @llvm.dx.wave.reduce.umin.v3i32(<3 x i32>)
+declare <4 x i64> @llvm.dx.wave.reduce.umin.v4i64(<4 x i64>)
diff --git a/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll
new file mode 100644
index 0000000..48ec98c
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lasx/fp-max-min.ll
@@ -0,0 +1,160 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lasx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lasx < %s | FileCheck %s
+
+define void @minnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a2, 0
+; CHECK-NEXT: xvld $xr1, $a1, 0
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5
+; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5
+; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2
+; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4
+; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6
+; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7
+; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 48
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1
+; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
+; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0
+; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0
+; CHECK-NEXT: fmin.s $fa4, $fa5, $fa4
+; CHECK-NEXT: vextrins.w $vr4, $vr2, 16
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2
+; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2
+; CHECK-NEXT: fmin.s $fa2, $fa5, $fa2
+; CHECK-NEXT: vextrins.w $vr4, $vr2, 32
+; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
+; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3
+; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.w $vr4, $vr0, 48
+; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2
+; CHECK-NEXT: xvst $xr4, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <8 x float>, ptr %x
+ %v1 = load <8 x float>, ptr %y
+ %r = call <8 x float> @llvm.minnum.v8f32(<8 x float> %v0, <8 x float> %v1)
+ store <8 x float> %r, ptr %res
+ ret void
+}
+
+define void @minnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a2, 0
+; CHECK-NEXT: xvld $xr1, $a1, 0
+; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3
+; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3
+; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2
+; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2
+; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2
+; CHECK-NEXT: fmin.d $fa3, $fa4, $fa3
+; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
+; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1
+; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1
+; CHECK-NEXT: fmin.d $fa2, $fa4, $fa2
+; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
+; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0
+; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
+; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <4 x double>, ptr %x
+ %v1 = load <4 x double>, ptr %y
+ %r = call <4 x double> @llvm.minnum.v4f64(<4 x double> %v0, <4 x double> %v1)
+ store <4 x double> %r, ptr %res
+ ret void
+}
+
+define void @maxnum_v8f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v8f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a2, 0
+; CHECK-NEXT: xvld $xr1, $a1, 0
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 5
+; CHECK-NEXT: xvpickve.w $xr3, $xr1, 5
+; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2
+; CHECK-NEXT: xvpickve.w $xr3, $xr0, 4
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 4
+; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 6
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 6
+; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 7
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 7
+; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 48
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 1
+; CHECK-NEXT: xvpickve.w $xr4, $xr1, 1
+; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
+; CHECK-NEXT: xvpickve.w $xr4, $xr0, 0
+; CHECK-NEXT: xvpickve.w $xr5, $xr1, 0
+; CHECK-NEXT: fmax.s $fa4, $fa5, $fa4
+; CHECK-NEXT: vextrins.w $vr4, $vr2, 16
+; CHECK-NEXT: xvpickve.w $xr2, $xr0, 2
+; CHECK-NEXT: xvpickve.w $xr5, $xr1, 2
+; CHECK-NEXT: fmax.s $fa2, $fa5, $fa2
+; CHECK-NEXT: vextrins.w $vr4, $vr2, 32
+; CHECK-NEXT: xvpickve.w $xr0, $xr0, 3
+; CHECK-NEXT: xvpickve.w $xr1, $xr1, 3
+; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.w $vr4, $vr0, 48
+; CHECK-NEXT: xvpermi.q $xr4, $xr3, 2
+; CHECK-NEXT: xvst $xr4, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <8 x float>, ptr %x
+ %v1 = load <8 x float>, ptr %y
+ %r = call <8 x float> @llvm.maxnum.v8f32(<8 x float> %v0, <8 x float> %v1)
+ store <8 x float> %r, ptr %res
+ ret void
+}
+
+define void @maxnum_v4f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v4f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: xvld $xr0, $a2, 0
+; CHECK-NEXT: xvld $xr1, $a1, 0
+; CHECK-NEXT: xvpickve.d $xr2, $xr0, 3
+; CHECK-NEXT: xvpickve.d $xr3, $xr1, 3
+; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2
+; CHECK-NEXT: xvpickve.d $xr3, $xr0, 2
+; CHECK-NEXT: xvpickve.d $xr4, $xr1, 2
+; CHECK-NEXT: fmax.d $fa3, $fa4, $fa3
+; CHECK-NEXT: vextrins.d $vr3, $vr2, 16
+; CHECK-NEXT: xvpickve.d $xr2, $xr0, 1
+; CHECK-NEXT: xvpickve.d $xr4, $xr1, 1
+; CHECK-NEXT: fmax.d $fa2, $fa4, $fa2
+; CHECK-NEXT: xvpickve.d $xr0, $xr0, 0
+; CHECK-NEXT: xvpickve.d $xr1, $xr1, 0
+; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
+; CHECK-NEXT: xvpermi.q $xr0, $xr3, 2
+; CHECK-NEXT: xvst $xr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <4 x double>, ptr %x
+ %v1 = load <4 x double>, ptr %y
+ %r = call <4 x double> @llvm.maxnum.v4f64(<4 x double> %v0, <4 x double> %v1)
+ store <4 x double> %r, ptr %res
+ ret void
+}
+
+declare <8 x float> @llvm.minnum.v8f32(<8 x float>, <8 x float>)
+declare <4 x double> @llvm.minnum.v4f64(<4 x double>, <4 x double>)
+declare <8 x float> @llvm.maxnum.v8f32(<8 x float>, <8 x float>)
+declare <4 x double> @llvm.maxnum.v4f64(<4 x double>, <4 x double>)
diff --git a/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll
new file mode 100644
index 0000000..27ecb75
--- /dev/null
+++ b/llvm/test/CodeGen/LoongArch/lsx/fp-max-min.ll
@@ -0,0 +1,112 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 6
+; RUN: llc --mtriple=loongarch32 --mattr=+32s,+lsx < %s | FileCheck %s
+; RUN: llc --mtriple=loongarch64 --mattr=+lsx < %s | FileCheck %s
+
+define void @minnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a2, 0
+; CHECK-NEXT: vld $vr1, $a1, 0
+; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1
+; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1
+; CHECK-NEXT: fmin.s $fa2, $fa3, $fa2
+; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0
+; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0
+; CHECK-NEXT: fmin.s $fa3, $fa4, $fa3
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
+; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2
+; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2
+; CHECK-NEXT: fmin.s $fa2, $fa4, $fa2
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
+; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3
+; CHECK-NEXT: fmin.s $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
+; CHECK-NEXT: vst $vr3, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <4 x float>, ptr %x
+ %v1 = load <4 x float>, ptr %y
+ %r = call <4 x float> @llvm.minnum.v4f32(<4 x float> %v0, <4 x float> %v1)
+ store <4 x float> %r, ptr %res
+ ret void
+}
+
+define void @minnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: minnum_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a2, 0
+; CHECK-NEXT: vld $vr1, $a1, 0
+; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1
+; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1
+; CHECK-NEXT: fmin.d $fa2, $fa3, $fa2
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
+; CHECK-NEXT: fmin.d $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <2 x double>, ptr %x
+ %v1 = load <2 x double>, ptr %y
+ %r = call <2 x double> @llvm.minnum.v2f64(<2 x double> %v0, <2 x double> %v1)
+ store <2 x double> %r, ptr %res
+ ret void
+}
+
+define void @maxnum_v4f32(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v4f32:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a2, 0
+; CHECK-NEXT: vld $vr1, $a1, 0
+; CHECK-NEXT: vreplvei.w $vr2, $vr0, 1
+; CHECK-NEXT: vreplvei.w $vr3, $vr1, 1
+; CHECK-NEXT: fmax.s $fa2, $fa3, $fa2
+; CHECK-NEXT: vreplvei.w $vr3, $vr0, 0
+; CHECK-NEXT: vreplvei.w $vr4, $vr1, 0
+; CHECK-NEXT: fmax.s $fa3, $fa4, $fa3
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 16
+; CHECK-NEXT: vreplvei.w $vr2, $vr0, 2
+; CHECK-NEXT: vreplvei.w $vr4, $vr1, 2
+; CHECK-NEXT: fmax.s $fa2, $fa4, $fa2
+; CHECK-NEXT: vextrins.w $vr3, $vr2, 32
+; CHECK-NEXT: vreplvei.w $vr0, $vr0, 3
+; CHECK-NEXT: vreplvei.w $vr1, $vr1, 3
+; CHECK-NEXT: fmax.s $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.w $vr3, $vr0, 48
+; CHECK-NEXT: vst $vr3, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <4 x float>, ptr %x
+ %v1 = load <4 x float>, ptr %y
+ %r = call <4 x float> @llvm.maxnum.v4f32(<4 x float> %v0, <4 x float> %v1)
+ store <4 x float> %r, ptr %res
+ ret void
+}
+
+define void @maxnum_v2f64(ptr %res, ptr %x, ptr %y) nounwind {
+; CHECK-LABEL: maxnum_v2f64:
+; CHECK: # %bb.0: # %entry
+; CHECK-NEXT: vld $vr0, $a2, 0
+; CHECK-NEXT: vld $vr1, $a1, 0
+; CHECK-NEXT: vreplvei.d $vr2, $vr0, 1
+; CHECK-NEXT: vreplvei.d $vr3, $vr1, 1
+; CHECK-NEXT: fmax.d $fa2, $fa3, $fa2
+; CHECK-NEXT: vreplvei.d $vr0, $vr0, 0
+; CHECK-NEXT: vreplvei.d $vr1, $vr1, 0
+; CHECK-NEXT: fmax.d $fa0, $fa1, $fa0
+; CHECK-NEXT: vextrins.d $vr0, $vr2, 16
+; CHECK-NEXT: vst $vr0, $a0, 0
+; CHECK-NEXT: ret
+entry:
+ %v0 = load <2 x double>, ptr %x
+ %v1 = load <2 x double>, ptr %y
+ %r = call <2 x double> @llvm.maxnum.v2f64(<2 x double> %v0, <2 x double> %v1)
+ store <2 x double> %r, ptr %res
+ ret void
+}
+
+declare <4 x float> @llvm.minnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.minnum.v2f64(<2 x double>, <2 x double>)
+declare <4 x float> @llvm.maxnum.v4f32(<4 x float>, <4 x float>)
+declare <2 x double> @llvm.maxnum.v2f64(<2 x double>, <2 x double>)
diff --git a/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll
new file mode 100644
index 0000000..b43555c6
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/atomic-rmw-minmax.ll
@@ -0,0 +1,642 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC %s
+; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IB-ZALRSC-PERM %s
+; RUN: llc -mtriple=riscv32 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=RV32IB-COMMON,RV32IAB %s
+;
+; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC %s
+; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=RV64IB-ZALRSC-PERM %s
+; RUN: llc -mtriple=riscv64 -mattr=+b,+zalrsc,+permissive-zalrsc,+a -verify-machineinstrs < %s \
+; RUN: | FileCheck -check-prefixes=RV64IAB %s
+
+define i32 @atomicrmw_max_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IB-ZALRSC: # %bb.0:
+; RV32IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT: mv a3, a2
+; RV32IB-ZALRSC-NEXT: bge a3, a1, .LBB0_3
+; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1
+; RV32IB-ZALRSC-NEXT: mv a3, a1
+; RV32IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1
+; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT: bnez a3, .LBB0_1
+; RV32IB-ZALRSC-NEXT: # %bb.4:
+; RV32IB-ZALRSC-NEXT: mv a0, a2
+; RV32IB-ZALRSC-NEXT: ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IB-ZALRSC-PERM: # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: max a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1
+; RV32IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT: ret
+;
+; RV32IAB-LABEL: atomicrmw_max_i32_seq_cst:
+; RV32IAB: # %bb.0:
+; RV32IAB-NEXT: amomax.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: bge a3, a2, .LBB0_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB0_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: .LBB0_3: # in Loop: Header=BB0_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB0_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a1
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: .LBB0_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: max a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB0_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_max_i32_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amomax.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw max ptr %a, i32 %b seq_cst
+ ret i32 %1
+}
+
+define i32 @atomicrmw_min_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IB-ZALRSC: # %bb.0:
+; RV32IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT: mv a3, a2
+; RV32IB-ZALRSC-NEXT: bge a1, a3, .LBB1_3
+; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1
+; RV32IB-ZALRSC-NEXT: mv a3, a1
+; RV32IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1
+; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT: bnez a3, .LBB1_1
+; RV32IB-ZALRSC-NEXT: # %bb.4:
+; RV32IB-ZALRSC-NEXT: mv a0, a2
+; RV32IB-ZALRSC-NEXT: ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IB-ZALRSC-PERM: # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: min a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1
+; RV32IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT: ret
+;
+; RV32IAB-LABEL: atomicrmw_min_i32_seq_cst:
+; RV32IAB: # %bb.0:
+; RV32IAB-NEXT: amomin.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: bge a2, a3, .LBB1_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB1_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: .LBB1_3: # in Loop: Header=BB1_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB1_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a1
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: .LBB1_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: min a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB1_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_min_i32_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amomin.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw min ptr %a, i32 %b seq_cst
+ ret i32 %1
+}
+
+define i32 @atomicrmw_umax_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IB-ZALRSC: # %bb.0:
+; RV32IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT: mv a3, a2
+; RV32IB-ZALRSC-NEXT: bgeu a3, a1, .LBB2_3
+; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1
+; RV32IB-ZALRSC-NEXT: mv a3, a1
+; RV32IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1
+; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT: bnez a3, .LBB2_1
+; RV32IB-ZALRSC-NEXT: # %bb.4:
+; RV32IB-ZALRSC-NEXT: mv a0, a2
+; RV32IB-ZALRSC-NEXT: ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IB-ZALRSC-PERM: # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1
+; RV32IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT: ret
+;
+; RV32IAB-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV32IAB: # %bb.0:
+; RV32IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: bgeu a3, a2, .LBB2_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB2_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: .LBB2_3: # in Loop: Header=BB2_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB2_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a1
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: .LBB2_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB2_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_umax_i32_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amomaxu.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw umax ptr %a, i32 %b seq_cst
+ ret i32 %1
+}
+
+define i32 @atomicrmw_umin_i32_seq_cst(ptr %a, i32 %b) nounwind {
+; RV32IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IB-ZALRSC: # %bb.0:
+; RV32IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-NEXT: mv a3, a2
+; RV32IB-ZALRSC-NEXT: bgeu a1, a3, .LBB3_3
+; RV32IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
+; RV32IB-ZALRSC-NEXT: mv a3, a1
+; RV32IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1
+; RV32IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-NEXT: bnez a3, .LBB3_1
+; RV32IB-ZALRSC-NEXT: # %bb.4:
+; RV32IB-ZALRSC-NEXT: mv a0, a2
+; RV32IB-ZALRSC-NEXT: ret
+;
+; RV32IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IB-ZALRSC-PERM: # %bb.0:
+; RV32IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV32IB-ZALRSC-PERM-NEXT: lr.w.aqrl a2, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: minu a3, a2, a1
+; RV32IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV32IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1
+; RV32IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV32IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV32IB-ZALRSC-PERM-NEXT: ret
+;
+; RV32IAB-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV32IAB: # %bb.0:
+; RV32IAB-NEXT: amominu.w.aqrl a0, a1, (a0)
+; RV32IAB-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: bgeu a2, a3, .LBB3_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB3_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: .LBB3_3: # in Loop: Header=BB3_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB3_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a1
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: sext.w a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: .LBB3_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.w.aqrl a1, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: minu a3, a1, a2
+; RV64IB-ZALRSC-PERM-NEXT: sc.w.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB3_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a1
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_umin_i32_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amominu.w.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw umin ptr %a, i32 %b seq_cst
+ ret i32 %1
+}
+
+define i64 @atomicrmw_max_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_max_i64_seq_cst:
+; RV32IB-COMMON: # %bb.0:
+; RV32IB-COMMON-NEXT: addi sp, sp, -32
+; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: mv s0, a2
+; RV32IB-COMMON-NEXT: mv s1, a0
+; RV32IB-COMMON-NEXT: lw a4, 0(a0)
+; RV32IB-COMMON-NEXT: lw a5, 4(a0)
+; RV32IB-COMMON-NEXT: mv s2, a1
+; RV32IB-COMMON-NEXT: j .LBB4_2
+; RV32IB-COMMON-NEXT: .LBB4_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT: sw a4, 8(sp)
+; RV32IB-COMMON-NEXT: sw a5, 12(sp)
+; RV32IB-COMMON-NEXT: addi a1, sp, 8
+; RV32IB-COMMON-NEXT: li a4, 5
+; RV32IB-COMMON-NEXT: li a5, 5
+; RV32IB-COMMON-NEXT: mv a0, s1
+; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT: lw a4, 8(sp)
+; RV32IB-COMMON-NEXT: lw a5, 12(sp)
+; RV32IB-COMMON-NEXT: bnez a0, .LBB4_7
+; RV32IB-COMMON-NEXT: .LBB4_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT: beq a5, s0, .LBB4_4
+; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT: slt a0, s0, a5
+; RV32IB-COMMON-NEXT: j .LBB4_5
+; RV32IB-COMMON-NEXT: .LBB4_4: # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT: sltu a0, s2, a4
+; RV32IB-COMMON-NEXT: .LBB4_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, a4
+; RV32IB-COMMON-NEXT: mv a3, a5
+; RV32IB-COMMON-NEXT: bnez a0, .LBB4_1
+; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB4_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, s2
+; RV32IB-COMMON-NEXT: mv a3, s0
+; RV32IB-COMMON-NEXT: j .LBB4_1
+; RV32IB-COMMON-NEXT: .LBB4_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT: mv a0, a4
+; RV32IB-COMMON-NEXT: mv a1, a5
+; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: addi sp, sp, 32
+; RV32IB-COMMON-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: bge a3, a1, .LBB4_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB4_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: .LBB4_3: # in Loop: Header=BB4_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB4_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a2
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: .LBB4_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: max a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB4_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_max_i64_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amomax.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw max ptr %a, i64 %b seq_cst
+ ret i64 %1
+}
+
+define i64 @atomicrmw_min_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_min_i64_seq_cst:
+; RV32IB-COMMON: # %bb.0:
+; RV32IB-COMMON-NEXT: addi sp, sp, -32
+; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: mv s0, a2
+; RV32IB-COMMON-NEXT: mv s1, a0
+; RV32IB-COMMON-NEXT: lw a4, 0(a0)
+; RV32IB-COMMON-NEXT: lw a5, 4(a0)
+; RV32IB-COMMON-NEXT: mv s2, a1
+; RV32IB-COMMON-NEXT: j .LBB5_2
+; RV32IB-COMMON-NEXT: .LBB5_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT: sw a4, 8(sp)
+; RV32IB-COMMON-NEXT: sw a5, 12(sp)
+; RV32IB-COMMON-NEXT: addi a1, sp, 8
+; RV32IB-COMMON-NEXT: li a4, 5
+; RV32IB-COMMON-NEXT: li a5, 5
+; RV32IB-COMMON-NEXT: mv a0, s1
+; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT: lw a4, 8(sp)
+; RV32IB-COMMON-NEXT: lw a5, 12(sp)
+; RV32IB-COMMON-NEXT: bnez a0, .LBB5_7
+; RV32IB-COMMON-NEXT: .LBB5_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT: beq a5, s0, .LBB5_4
+; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT: slt a0, a5, s0
+; RV32IB-COMMON-NEXT: j .LBB5_5
+; RV32IB-COMMON-NEXT: .LBB5_4: # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT: sltu a0, a4, s2
+; RV32IB-COMMON-NEXT: .LBB5_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, a4
+; RV32IB-COMMON-NEXT: mv a3, a5
+; RV32IB-COMMON-NEXT: bnez a0, .LBB5_1
+; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB5_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, s2
+; RV32IB-COMMON-NEXT: mv a3, s0
+; RV32IB-COMMON-NEXT: j .LBB5_1
+; RV32IB-COMMON-NEXT: .LBB5_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT: mv a0, a4
+; RV32IB-COMMON-NEXT: mv a1, a5
+; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: addi sp, sp, 32
+; RV32IB-COMMON-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: bge a1, a3, .LBB5_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB5_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: .LBB5_3: # in Loop: Header=BB5_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB5_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a2
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: .LBB5_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: min a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB5_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_min_i64_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amomin.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw min ptr %a, i64 %b seq_cst
+ ret i64 %1
+}
+
+define i64 @atomicrmw_umax_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV32IB-COMMON: # %bb.0:
+; RV32IB-COMMON-NEXT: addi sp, sp, -32
+; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: mv s0, a2
+; RV32IB-COMMON-NEXT: mv s1, a0
+; RV32IB-COMMON-NEXT: lw a4, 0(a0)
+; RV32IB-COMMON-NEXT: lw a5, 4(a0)
+; RV32IB-COMMON-NEXT: mv s2, a1
+; RV32IB-COMMON-NEXT: j .LBB6_2
+; RV32IB-COMMON-NEXT: .LBB6_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT: sw a4, 8(sp)
+; RV32IB-COMMON-NEXT: sw a5, 12(sp)
+; RV32IB-COMMON-NEXT: addi a1, sp, 8
+; RV32IB-COMMON-NEXT: li a4, 5
+; RV32IB-COMMON-NEXT: li a5, 5
+; RV32IB-COMMON-NEXT: mv a0, s1
+; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT: lw a4, 8(sp)
+; RV32IB-COMMON-NEXT: lw a5, 12(sp)
+; RV32IB-COMMON-NEXT: bnez a0, .LBB6_7
+; RV32IB-COMMON-NEXT: .LBB6_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT: beq a5, s0, .LBB6_4
+; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT: sltu a0, s0, a5
+; RV32IB-COMMON-NEXT: j .LBB6_5
+; RV32IB-COMMON-NEXT: .LBB6_4: # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT: sltu a0, s2, a4
+; RV32IB-COMMON-NEXT: .LBB6_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, a4
+; RV32IB-COMMON-NEXT: mv a3, a5
+; RV32IB-COMMON-NEXT: bnez a0, .LBB6_1
+; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB6_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, s2
+; RV32IB-COMMON-NEXT: mv a3, s0
+; RV32IB-COMMON-NEXT: j .LBB6_1
+; RV32IB-COMMON-NEXT: .LBB6_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT: mv a0, a4
+; RV32IB-COMMON-NEXT: mv a1, a5
+; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: addi sp, sp, 32
+; RV32IB-COMMON-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: bgeu a3, a1, .LBB6_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB6_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: .LBB6_3: # in Loop: Header=BB6_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB6_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a2
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: .LBB6_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: maxu a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB6_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_umax_i64_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amomaxu.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw umax ptr %a, i64 %b seq_cst
+ ret i64 %1
+}
+
+define i64 @atomicrmw_umin_i64_seq_cst(ptr %a, i64 %b) nounwind {
+; RV32IB-COMMON-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV32IB-COMMON: # %bb.0:
+; RV32IB-COMMON-NEXT: addi sp, sp, -32
+; RV32IB-COMMON-NEXT: sw ra, 28(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s0, 24(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s1, 20(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: sw s2, 16(sp) # 4-byte Folded Spill
+; RV32IB-COMMON-NEXT: mv s0, a2
+; RV32IB-COMMON-NEXT: mv s1, a0
+; RV32IB-COMMON-NEXT: lw a4, 0(a0)
+; RV32IB-COMMON-NEXT: lw a5, 4(a0)
+; RV32IB-COMMON-NEXT: mv s2, a1
+; RV32IB-COMMON-NEXT: j .LBB7_2
+; RV32IB-COMMON-NEXT: .LBB7_1: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT: sw a4, 8(sp)
+; RV32IB-COMMON-NEXT: sw a5, 12(sp)
+; RV32IB-COMMON-NEXT: addi a1, sp, 8
+; RV32IB-COMMON-NEXT: li a4, 5
+; RV32IB-COMMON-NEXT: li a5, 5
+; RV32IB-COMMON-NEXT: mv a0, s1
+; RV32IB-COMMON-NEXT: call __atomic_compare_exchange_8
+; RV32IB-COMMON-NEXT: lw a4, 8(sp)
+; RV32IB-COMMON-NEXT: lw a5, 12(sp)
+; RV32IB-COMMON-NEXT: bnez a0, .LBB7_7
+; RV32IB-COMMON-NEXT: .LBB7_2: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # =>This Inner Loop Header: Depth=1
+; RV32IB-COMMON-NEXT: beq a5, s0, .LBB7_4
+; RV32IB-COMMON-NEXT: # %bb.3: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT: sltu a0, a5, s0
+; RV32IB-COMMON-NEXT: j .LBB7_5
+; RV32IB-COMMON-NEXT: .LBB7_4: # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT: sltu a0, a4, s2
+; RV32IB-COMMON-NEXT: .LBB7_5: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, a4
+; RV32IB-COMMON-NEXT: mv a3, a5
+; RV32IB-COMMON-NEXT: bnez a0, .LBB7_1
+; RV32IB-COMMON-NEXT: # %bb.6: # %atomicrmw.start
+; RV32IB-COMMON-NEXT: # in Loop: Header=BB7_2 Depth=1
+; RV32IB-COMMON-NEXT: mv a2, s2
+; RV32IB-COMMON-NEXT: mv a3, s0
+; RV32IB-COMMON-NEXT: j .LBB7_1
+; RV32IB-COMMON-NEXT: .LBB7_7: # %atomicrmw.end
+; RV32IB-COMMON-NEXT: mv a0, a4
+; RV32IB-COMMON-NEXT: mv a1, a5
+; RV32IB-COMMON-NEXT: lw ra, 28(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s0, 24(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s1, 20(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: lw s2, 16(sp) # 4-byte Folded Reload
+; RV32IB-COMMON-NEXT: addi sp, sp, 32
+; RV32IB-COMMON-NEXT: ret
+;
+; RV64IB-ZALRSC-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IB-ZALRSC: # %bb.0:
+; RV64IB-ZALRSC-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-NEXT: mv a3, a2
+; RV64IB-ZALRSC-NEXT: bgeu a1, a3, .LBB7_3
+; RV64IB-ZALRSC-NEXT: # %bb.2: # in Loop: Header=BB7_1 Depth=1
+; RV64IB-ZALRSC-NEXT: mv a3, a1
+; RV64IB-ZALRSC-NEXT: .LBB7_3: # in Loop: Header=BB7_1 Depth=1
+; RV64IB-ZALRSC-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-NEXT: bnez a3, .LBB7_1
+; RV64IB-ZALRSC-NEXT: # %bb.4:
+; RV64IB-ZALRSC-NEXT: mv a0, a2
+; RV64IB-ZALRSC-NEXT: ret
+;
+; RV64IB-ZALRSC-PERM-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IB-ZALRSC-PERM: # %bb.0:
+; RV64IB-ZALRSC-PERM-NEXT: .LBB7_1: # =>This Inner Loop Header: Depth=1
+; RV64IB-ZALRSC-PERM-NEXT: lr.d.aqrl a2, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: minu a3, a2, a1
+; RV64IB-ZALRSC-PERM-NEXT: sc.d.rl a3, a3, (a0)
+; RV64IB-ZALRSC-PERM-NEXT: bnez a3, .LBB7_1
+; RV64IB-ZALRSC-PERM-NEXT: # %bb.2:
+; RV64IB-ZALRSC-PERM-NEXT: mv a0, a2
+; RV64IB-ZALRSC-PERM-NEXT: ret
+;
+; RV64IAB-LABEL: atomicrmw_umin_i64_seq_cst:
+; RV64IAB: # %bb.0:
+; RV64IAB-NEXT: amominu.d.aqrl a0, a1, (a0)
+; RV64IAB-NEXT: ret
+ %1 = atomicrmw umin ptr %a, i64 %b seq_cst
+ ret i64 %1
+}
diff --git a/llvm/test/CodeGen/RISCV/features-info.ll b/llvm/test/CodeGen/RISCV/features-info.ll
index 5e5f2b7..37e11db 100644
--- a/llvm/test/CodeGen/RISCV/features-info.ll
+++ b/llvm/test/CodeGen/RISCV/features-info.ll
@@ -81,6 +81,7 @@
; CHECK-NEXT: optimized-nf7-segment-load-store - vlseg7eN.v and vsseg7eN.v are implemented as a wide memory op and shuffle.
; CHECK-NEXT: optimized-nf8-segment-load-store - vlseg8eN.v and vsseg8eN.v are implemented as a wide memory op and shuffle.
; CHECK-NEXT: optimized-zero-stride-load - Optimized (perform fewer memory operations)zero-stride vector load.
+; CHECK-NEXT: permissive-zalrsc - Implementation permits non-base instructions between LR/SC pairs.
; CHECK-NEXT: predictable-select-expensive - Prefer likely predicted branches over selects.
; CHECK-NEXT: prefer-vsetvli-over-read-vlenb - Prefer vsetvli over read vlenb CSR to calculate VLEN.
; CHECK-NEXT: prefer-w-inst - Prefer instructions with W suffix.
diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll
new file mode 100644
index 0000000..d121c1a
--- /dev/null
+++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveActiveMin.ll
@@ -0,0 +1,57 @@
+; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-vulkan-unknown %s -o - | FileCheck %s
+; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-vulkan-unknown %s -o - -filetype=obj | spirv-val %}
+
+; Test lowering to the SPIR-V backend for various scalar and vector types.
+
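+; Each reduction below requires the GroupNonUniformArithmetic capability and
+; is performed at Subgroup scope (the integer constant 3 captured as the scope
+; operand).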
+; CHECK: OpCapability GroupNonUniformArithmetic
+
+; CHECK-DAG: %[[#f16:]] = OpTypeFloat 16
+; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32
+; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0
+; CHECK-DAG: %[[#v4_half:]] = OpTypeVector %[[#f16]] 4
+; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3
+
+; CHECK-LABEL: Begin function test_float
+; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]]
+define float @test_float(float %fexpr) {
+entry:
+; CHECK: %[[#fret:]] = OpGroupNonUniformFMin %[[#f32]] %[[#scope]] Reduce %[[#fexpr]]
+ %0 = call float @llvm.spv.wave.reduce.min.f32(float %fexpr)
+ ret float %0
+}
+
+; CHECK-LABEL: Begin function test_int_signed
+; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+define i32 @test_int_signed(i32 %iexpr) {
+entry:
+; CHECK: %[[#iret:]] = OpGroupNonUniformSMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]]
+ %0 = call i32 @llvm.spv.wave.reduce.min.i32(i32 %iexpr)
+ ret i32 %0
+}
+
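+; The unsigned variant calls llvm.spv.wave.reduce.umin, which lowers to
+; OpGroupNonUniformUMin instead of the signed SMin form.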
+; CHECK-LABEL: Begin function test_int_unsigned
+; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]]
+define i32 @test_int_unsigned(i32 %iexpr) {
+entry:
+; CHECK: %[[#iret:]] = OpGroupNonUniformUMin %[[#uint]] %[[#scope]] Reduce %[[#iexpr]]
+ %0 = call i32 @llvm.spv.wave.reduce.umin.i32(i32 %iexpr)
+ ret i32 %0
+}
+
+; CHECK-LABEL: Begin function test_vhalf
+; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_half]]
+define <4 x half> @test_vhalf(<4 x half> %vbexpr) {
+entry:
+; CHECK: %[[#vhalfret:]] = OpGroupNonUniformFMin %[[#v4_half]] %[[#scope]] Reduce %[[#vbexpr]]
+ %0 = call <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half> %vbexpr)
+ ret <4 x half> %0
+}
+
+declare float @llvm.spv.wave.reduce.min.f32(float)
+declare i32 @llvm.spv.wave.reduce.min.i32(i32)
+declare <4 x half> @llvm.spv.wave.reduce.min.v4half(<4 x half>)
+
+declare float @llvm.spv.wave.reduce.umin.f32(float)
+declare i32 @llvm.spv.wave.reduce.umin.i32(i32)
+declare <4 x half> @llvm.spv.wave.reduce.umin.v4half(<4 x half>)
+