aboutsummaryrefslogtreecommitdiff
path: root/llvm/test/CodeGen/AMDGPU
diff options
context:
space:
mode:
authorVitaly Buka <vitalybuka@google.com>2024-04-02 14:23:42 -0700
committerVitaly Buka <vitalybuka@google.com>2024-04-02 14:23:42 -0700
commit2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310 (patch)
tree4a2ce5eb31e8242dcbb7d7a3de82d3309fdc23c5 /llvm/test/CodeGen/AMDGPU
parenteb6a41808ef4e058a24f9ebc6c85b10c966eb183 (diff)
parent89271b46761749503dffe94c60b9cbe0bda80284 (diff)
downloadllvm-2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310.zip
llvm-2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310.tar.gz
llvm-2fe88fc8b7a3c27d473b6a172f0dc8aae7be3310.tar.bz2
[𝘀𝗽𝗿] changes introduced through rebase
Created using spr 1.3.4 [skip ci]
Diffstat (limited to 'llvm/test/CodeGen/AMDGPU')
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll274
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir15
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir15
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir45
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir15
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir45
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir4
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll41
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll232
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll147
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll1063
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll408
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll229
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir68
-rw-r--r--llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/allow-check.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll255
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll51
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll108
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll12
-rw-r--r--llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll19
-rw-r--r--llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16-conversions.ll357
-rw-r--r--llvm/test/CodeGen/AMDGPU/bf16.ll903
-rw-r--r--llvm/test/CodeGen/AMDGPU/clamp.ll64
-rw-r--r--llvm/test/CodeGen/AMDGPU/convergence-tokens.ll55
-rw-r--r--llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/div_i128.ll2298
-rw-r--r--llvm/test/CodeGen/AMDGPU/div_v2i128.ll3233
-rw-r--r--llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll103
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll728
-rw-r--r--llvm/test/CodeGen/AMDGPU/fcanonicalize.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll22
-rw-r--r--llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp-classify.ll60
-rw-r--r--llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll270
-rw-r--r--llvm/test/CodeGen/AMDGPU/fptoi.i128.ll1502
-rw-r--r--llvm/test/CodeGen/AMDGPU/fract-match.ll167
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll867
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll564
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll564
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll669
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll5578
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll3960
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll3960
-rw-r--r--llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll5576
-rw-r--r--llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/itofp.i128.ll1618
-rw-r--r--llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llc-pipeline.ll10
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll146
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll111
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll30
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll333
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll317
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll280
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll122
-rw-r--r--llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll47
-rw-r--r--llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll191
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir1154
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-buffer.mir1130
-rw-r--r--llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir28
-rw-r--r--llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir504
-rw-r--r--llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll305
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll11
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll20
-rw-r--r--llvm/test/CodeGen/AMDGPU/preload-kernargs.ll8976
-rw-r--r--llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll69
-rw-r--r--llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll18
-rw-r--r--llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll44
-rw-r--r--llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll8
-rw-r--r--llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir2
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll15
-rw-r--r--llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll78
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll6
-rw-r--r--llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll4
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll26
-rw-r--r--llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir1
-rw-r--r--llvm/test/CodeGen/AMDGPU/wave32.ll34
-rw-r--r--llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll1
-rw-r--r--llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll2
-rw-r--r--llvm/test/CodeGen/AMDGPU/wwm-reserved.ll4
117 files changed, 36973 insertions, 13816 deletions
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
index 255c6de..1a76f8c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fp64-atomics-gfx90a.ll
@@ -1090,18 +1090,29 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB39_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB39_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB39_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1109,20 +1120,31 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB39_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB39_2
+; GFX90A-NEXT: .LBB39_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB39_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: .LBB39_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -1132,26 +1154,47 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB40_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: .LBB40_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB40_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: .LBB40_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1161,18 +1204,29 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB41_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB41_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB41_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1180,20 +1234,31 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB41_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB41_2
+; GFX90A-NEXT: .LBB41_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB41_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: .LBB41_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1203,26 +1268,47 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB42_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: .LBB42_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB42_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: .LBB42_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1394,37 +1480,59 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB49_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB49_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB49_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB49_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB49_2
+; GFX90A-NEXT: .LBB49_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB49_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: v_mov_b32_e32 v2, 0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: .LBB49_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1866,23 +1974,44 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB65_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB65_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB65_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: .LBB65_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -1892,23 +2021,44 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB66_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB66_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB66_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: .LBB66_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -1918,44 +2068,66 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: s_mov_b32 s4, s3
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB67_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v2, s0
-; GFX90A-NEXT: ds_read_b64 v[0:1], v2
+; GFX90A-NEXT: v_mov_b32_e32 v4, s0
+; GFX90A-NEXT: ds_read_b64 v[2:3], v4
; GFX90A-NEXT: s_mov_b64 s[0:1], 0
-; GFX90A-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB67_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
+; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB67_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB67_2
+; GFX90A-NEXT: .LBB67_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: s_mov_b32 s4, s3
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s4, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB67_3
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v2, s0
-; GFX940-NEXT: ds_read_b64 v[0:1], v2
+; GFX940-NEXT: v_mov_b32_e32 v4, s0
+; GFX940-NEXT: ds_read_b64 v[2:3], v4
; GFX940-NEXT: s_mov_b64 s[0:1], 0
-; GFX940-NEXT: .LBB67_1: ; %atomicrmw.start
+; GFX940-NEXT: .LBB67_2: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
+; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
+; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB67_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_cbranch_execnz .LBB67_2
+; GFX940-NEXT: .LBB67_3:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir
index e288d9d..eafd1e1 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-atomic-cmpxchg-with-success.mir
@@ -16,7 +16,8 @@ body: |
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 1)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]]
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1)
%0:_(p1) = COPY $vgpr0_vgpr1
%1:_(s32) = COPY $vgpr2
%2:_(s32) = COPY $vgpr3
@@ -40,7 +41,8 @@ body: |
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[COPY2]](s32), [[COPY1]](s32)
; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p0), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32))
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s32), [[COPY1]]
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s32)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1)
%0:_(p0) = COPY $vgpr0_vgpr1
%1:_(s32) = COPY $vgpr2
%2:_(s32) = COPY $vgpr3
@@ -63,7 +65,8 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY $vgpr2
; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s32) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s32), addrspace 3)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s32), [[COPY1]]
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s32), implicit [[ICMP]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s32) = COPY [[ATOMIC_CMPXCHG]](s32)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s32), implicit [[ICMP]](s1)
%0:_(p3) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
%2:_(s32) = COPY $vgpr2
@@ -87,7 +90,8 @@ body: |
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[COPY2]](s64), [[COPY1]](s64)
; CHECK-NEXT: [[AMDGPU_ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_AMDGPU_ATOMIC_CMPXCHG [[COPY]](p1), [[BUILD_VECTOR]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 1)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[AMDGPU_ATOMIC_CMPXCHG]](s64), [[COPY1]]
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[AMDGPU_ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[AMDGPU_ATOMIC_CMPXCHG]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1)
%0:_(p1) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
%2:_(s64) = COPY $vgpr4_vgpr5
@@ -110,7 +114,8 @@ body: |
; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY $vgpr3_vgpr4
; CHECK-NEXT: [[ATOMIC_CMPXCHG:%[0-9]+]]:_(s64) = G_ATOMIC_CMPXCHG [[COPY]](p3), [[COPY1]], [[COPY2]] :: (load store syncscope("agent-one-as") monotonic monotonic (s64), addrspace 3)
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(eq), [[ATOMIC_CMPXCHG]](s64), [[COPY1]]
- ; CHECK-NEXT: S_ENDPGM 0, implicit [[ATOMIC_CMPXCHG]](s64), implicit [[ICMP]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[ATOMIC_CMPXCHG]](s64)
+ ; CHECK-NEXT: S_ENDPGM 0, implicit [[COPY3]](s64), implicit [[ICMP]](s1)
%0:_(p3) = COPY $vgpr0
%1:_(s64) = COPY $vgpr1_vgpr2
%2:_(s64) = COPY $vgpr3_vgpr4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
index e9f8180..fed277d 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ctlz-zero-undef.mir
@@ -64,9 +64,7 @@ body: |
; CHECK-NEXT: {{ $}}
; CHECK-NEXT: [[COPY:%[0-9]+]]:_(s32) = COPY $vgpr0
; CHECK-NEXT: [[CTLZ_ZERO_UNDEF:%[0-9]+]]:_(s32) = G_CTLZ_ZERO_UNDEF [[COPY]](s32)
- ; CHECK-NEXT: [[C:%[0-9]+]]:_(s32) = G_CONSTANT i32 65535
- ; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[CTLZ_ZERO_UNDEF]], [[C]]
- ; CHECK-NEXT: $vgpr0 = COPY [[AND]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[CTLZ_ZERO_UNDEF]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s16) = G_CTLZ_ZERO_UNDEF %0
%2:_(s32) = G_ZEXT %1
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir
index dba20e1..eb86a98 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddo.mir
@@ -86,8 +86,9 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[ADD]](s32), [[COPY]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s32), [[C]]
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[ADD]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[ADD]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -117,8 +118,9 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]]
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
@@ -172,11 +174,12 @@ body: |
; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]]
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]]
; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]]
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>)
; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
@@ -360,13 +363,14 @@ body: |
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1)
; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1)
; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]]
; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]]
; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]]
; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]]
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 x s16>)
; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr1_vgpr2
@@ -403,11 +407,12 @@ body: |
; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]]
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>)
; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
index 93d0071..80b3166 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-saddsat.mir
@@ -955,15 +955,16 @@ body: |
; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]]
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]]
; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
;
; GFX8-LABEL: name: saddsat_s64
@@ -980,15 +981,16 @@ body: |
; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]]
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]]
; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
;
; GFX9-LABEL: name: saddsat_s64
@@ -1005,15 +1007,16 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[COPY1]](s64), [[C]]
; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO3]]
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
@@ -1043,15 +1046,16 @@ body: |
; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]]
; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]]
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]]
; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX6-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]]
@@ -1060,13 +1064,14 @@ body: |
; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]]
; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]]
; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]]
- ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32)
+ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64)
+ ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32)
; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64)
; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX6-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]]
; GFX6-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]]
; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32)
- ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]]
+ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]]
; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
;
@@ -1086,15 +1091,16 @@ body: |
; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]]
; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]]
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]]
; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX8-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]]
@@ -1103,13 +1109,14 @@ body: |
; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]]
; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]]
; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]]
- ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32)
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64)
+ ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32)
; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64)
; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX8-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]]
; GFX8-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]]
; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32)
- ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]]
+ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]]
; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
;
@@ -1129,15 +1136,16 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]]
; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV2]](s64), [[C]]
; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO3]]
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX9-NEXT: [[UADDO4:%[0-9]+]]:_(s32), [[UADDO5:%[0-9]+]]:_(s1) = G_UADDO [[UV12]], [[UV14]]
@@ -1146,13 +1154,14 @@ body: |
; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]]
; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[UV3]](s64), [[C]]
; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]]
- ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64)
+ ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32)
; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64)
; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX9-NEXT: [[UADDO6:%[0-9]+]]:_(s32), [[UADDO7:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]]
; GFX9-NEXT: [[UADDE6:%[0-9]+]]:_(s32), [[UADDE7:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO7]]
; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO6]](s32), [[UADDE6]](s32)
- ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir
index 57b1ab9..220450c 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubo.mir
@@ -86,8 +86,9 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[SUB]](s32), [[COPY]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s32), [[C]]
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s32) = COPY [[SUB]](s32)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
- ; CHECK-NEXT: $vgpr0 = COPY [[SUB]](s32)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY2]](s32)
; CHECK-NEXT: $vgpr1 = COPY [[ZEXT]](s32)
%0:_(s32) = COPY $vgpr0
%1:_(s32) = COPY $vgpr1
@@ -117,8 +118,9 @@ body: |
; CHECK-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; CHECK-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]]
; CHECK-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; CHECK-NEXT: [[ZEXT:%[0-9]+]]:_(s32) = G_ZEXT [[XOR]](s1)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[MV]](s64)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](s64)
; CHECK-NEXT: $vgpr2 = COPY [[ZEXT]](s32)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
@@ -172,11 +174,12 @@ body: |
; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]]
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1)
+ ; CHECK-NEXT: [[COPY3:%[0-9]+]]:_(<2 x s16>) = COPY [[BITCAST2]](<2 x s16>)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND2:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]]
; CHECK-NEXT: [[AND3:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]]
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND2]](s32), [[AND3]](s32)
- ; CHECK-NEXT: $vgpr0 = COPY [[BITCAST2]](<2 x s16>)
+ ; CHECK-NEXT: $vgpr0 = COPY [[COPY3]](<2 x s16>)
; CHECK-NEXT: $vgpr1_vgpr2 = COPY [[BUILD_VECTOR]](<2 x s32>)
%0:_(<2 x s16>) = COPY $vgpr0
%1:_(<2 x s16>) = COPY $vgpr1
@@ -360,13 +363,14 @@ body: |
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1)
; CHECK-NEXT: [[ANYEXT2:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR2]](s1)
; CHECK-NEXT: [[ANYEXT3:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR3]](s1)
+ ; CHECK-NEXT: [[COPY5:%[0-9]+]]:_(<4 x s16>) = COPY [[CONCAT_VECTORS]](<4 x s16>)
; CHECK-NEXT: [[C3:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND4:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C3]]
; CHECK-NEXT: [[AND5:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C3]]
; CHECK-NEXT: [[AND6:%[0-9]+]]:_(s32) = G_AND [[ANYEXT2]], [[C3]]
; CHECK-NEXT: [[AND7:%[0-9]+]]:_(s32) = G_AND [[ANYEXT3]], [[C3]]
; CHECK-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<4 x s32>) = G_BUILD_VECTOR [[AND4]](s32), [[AND5]](s32), [[AND6]](s32), [[AND7]](s32)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[CONCAT_VECTORS]](<4 x s16>)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY5]](<4 x s16>)
; CHECK-NEXT: $vgpr2_vgpr3_vgpr4_vgpr5 = COPY [[BUILD_VECTOR]](<4 x s32>)
%0:_(<4 x s16>) = COPY $vgpr0_vgpr1
%1:_(<4 x s16>) = COPY $vgpr1_vgpr2
@@ -403,11 +407,12 @@ body: |
; CHECK-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP1]]
; CHECK-NEXT: [[ANYEXT:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR]](s1)
; CHECK-NEXT: [[ANYEXT1:%[0-9]+]]:_(s32) = G_ANYEXT [[XOR1]](s1)
+ ; CHECK-NEXT: [[COPY2:%[0-9]+]]:_(<2 x s32>) = COPY [[BUILD_VECTOR]](<2 x s32>)
; CHECK-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 1
; CHECK-NEXT: [[AND:%[0-9]+]]:_(s32) = G_AND [[ANYEXT]], [[C1]]
; CHECK-NEXT: [[AND1:%[0-9]+]]:_(s32) = G_AND [[ANYEXT1]], [[C1]]
; CHECK-NEXT: [[BUILD_VECTOR1:%[0-9]+]]:_(<2 x s32>) = G_BUILD_VECTOR [[AND]](s32), [[AND1]](s32)
- ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[BUILD_VECTOR]](<2 x s32>)
+ ; CHECK-NEXT: $vgpr0_vgpr1 = COPY [[COPY2]](<2 x s32>)
; CHECK-NEXT: $vgpr2_vgpr3 = COPY [[BUILD_VECTOR1]](<2 x s32>)
%0:_(<2 x s32>) = COPY $vgpr0_vgpr1
%1:_(<2 x s32>) = COPY $vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir
index 33a8cda..49fb6e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-ssubsat.mir
@@ -955,15 +955,16 @@ body: |
; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]]
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX6-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX6-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]]
; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX6-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
;
; GFX8-LABEL: name: ssubsat_s64
@@ -980,15 +981,16 @@ body: |
; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]]
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX8-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX8-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]]
; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX8-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
;
; GFX9-LABEL: name: ssubsat_s64
@@ -1005,15 +1007,16 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[COPY]]
; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[COPY1]](s64), [[C]]
; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX9-NEXT: [[UV4:%[0-9]+]]:_(s32), [[UV5:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX9-NEXT: [[UV6:%[0-9]+]]:_(s32), [[UV7:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV4]], [[UV6]]
; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV5]], [[UV7]], [[UADDO1]]
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX9-NEXT: $vgpr0_vgpr1 = COPY [[SELECT]](s64)
%0:_(s64) = COPY $vgpr0_vgpr1
%1:_(s64) = COPY $vgpr2_vgpr3
@@ -1043,15 +1046,16 @@ body: |
; GFX6-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]]
; GFX6-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]]
; GFX6-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX6-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX6-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX6-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX6-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX6-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX6-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX6-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX6-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]]
; GFX6-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX6-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX6-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX6-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX6-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]]
@@ -1060,13 +1064,14 @@ body: |
; GFX6-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]]
; GFX6-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]]
; GFX6-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]]
- ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32)
+ ; GFX6-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64)
+ ; GFX6-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32)
; GFX6-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64)
; GFX6-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX6-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]]
; GFX6-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]]
; GFX6-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]]
+ ; GFX6-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]]
; GFX6-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX6-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
;
@@ -1086,15 +1091,16 @@ body: |
; GFX8-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]]
; GFX8-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]]
; GFX8-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX8-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX8-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX8-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX8-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX8-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX8-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX8-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX8-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]]
; GFX8-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX8-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX8-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX8-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX8-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]]
@@ -1103,13 +1109,14 @@ body: |
; GFX8-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]]
; GFX8-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]]
; GFX8-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]]
- ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32)
+ ; GFX8-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64)
+ ; GFX8-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32)
; GFX8-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64)
; GFX8-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX8-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]]
; GFX8-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]]
; GFX8-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]]
+ ; GFX8-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]]
; GFX8-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX8-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
;
@@ -1129,15 +1136,16 @@ body: |
; GFX9-NEXT: [[ICMP:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV]](s64), [[UV]]
; GFX9-NEXT: [[ICMP1:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV2]](s64), [[C]]
; GFX9-NEXT: [[XOR:%[0-9]+]]:_(s1) = G_XOR [[ICMP1]], [[ICMP]]
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:_(s64) = COPY [[MV]](s64)
; GFX9-NEXT: [[C1:%[0-9]+]]:_(s32) = G_CONSTANT i32 63
- ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[MV]], [[C1]](s32)
+ ; GFX9-NEXT: [[ASHR:%[0-9]+]]:_(s64) = G_ASHR [[COPY2]], [[C1]](s32)
; GFX9-NEXT: [[C2:%[0-9]+]]:_(s64) = G_CONSTANT i64 -9223372036854775808
; GFX9-NEXT: [[UV8:%[0-9]+]]:_(s32), [[UV9:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR]](s64)
; GFX9-NEXT: [[UV10:%[0-9]+]]:_(s32), [[UV11:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX9-NEXT: [[UADDO:%[0-9]+]]:_(s32), [[UADDO1:%[0-9]+]]:_(s1) = G_UADDO [[UV8]], [[UV10]]
; GFX9-NEXT: [[UADDE:%[0-9]+]]:_(s32), [[UADDE1:%[0-9]+]]:_(s1) = G_UADDE [[UV9]], [[UV11]], [[UADDO1]]
; GFX9-NEXT: [[MV1:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO]](s32), [[UADDE]](s32)
- ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[MV]]
+ ; GFX9-NEXT: [[SELECT:%[0-9]+]]:_(s64) = G_SELECT [[XOR]](s1), [[MV1]], [[COPY2]]
; GFX9-NEXT: [[UV12:%[0-9]+]]:_(s32), [[UV13:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV1]](s64)
; GFX9-NEXT: [[UV14:%[0-9]+]]:_(s32), [[UV15:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[UV3]](s64)
; GFX9-NEXT: [[USUBO2:%[0-9]+]]:_(s32), [[USUBO3:%[0-9]+]]:_(s1) = G_USUBO [[UV12]], [[UV14]]
@@ -1146,13 +1154,14 @@ body: |
; GFX9-NEXT: [[ICMP2:%[0-9]+]]:_(s1) = G_ICMP intpred(slt), [[MV2]](s64), [[UV1]]
; GFX9-NEXT: [[ICMP3:%[0-9]+]]:_(s1) = G_ICMP intpred(sgt), [[UV3]](s64), [[C]]
; GFX9-NEXT: [[XOR1:%[0-9]+]]:_(s1) = G_XOR [[ICMP3]], [[ICMP2]]
- ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[MV2]], [[C1]](s32)
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:_(s64) = COPY [[MV2]](s64)
+ ; GFX9-NEXT: [[ASHR1:%[0-9]+]]:_(s64) = G_ASHR [[COPY3]], [[C1]](s32)
; GFX9-NEXT: [[UV16:%[0-9]+]]:_(s32), [[UV17:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[ASHR1]](s64)
; GFX9-NEXT: [[UV18:%[0-9]+]]:_(s32), [[UV19:%[0-9]+]]:_(s32) = G_UNMERGE_VALUES [[C2]](s64)
; GFX9-NEXT: [[UADDO2:%[0-9]+]]:_(s32), [[UADDO3:%[0-9]+]]:_(s1) = G_UADDO [[UV16]], [[UV18]]
; GFX9-NEXT: [[UADDE2:%[0-9]+]]:_(s32), [[UADDE3:%[0-9]+]]:_(s1) = G_UADDE [[UV17]], [[UV19]], [[UADDO3]]
; GFX9-NEXT: [[MV3:%[0-9]+]]:_(s64) = G_MERGE_VALUES [[UADDO2]](s32), [[UADDE2]](s32)
- ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[MV2]]
+ ; GFX9-NEXT: [[SELECT1:%[0-9]+]]:_(s64) = G_SELECT [[XOR1]](s1), [[MV3]], [[COPY3]]
; GFX9-NEXT: [[BUILD_VECTOR:%[0-9]+]]:_(<2 x s64>) = G_BUILD_VECTOR [[SELECT]](s64), [[SELECT1]](s64)
; GFX9-NEXT: $vgpr0_vgpr1_vgpr2_vgpr3 = COPY [[BUILD_VECTOR]](<2 x s64>)
%0:_(<2 x s64>) = COPY $vgpr0_vgpr1_vgpr2_vgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir
index b4bc648..305eca7 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/legalize-trap.mir
@@ -24,7 +24,7 @@ body: |
bb.0:
%0:_(s8) = G_CONSTANT i8 0
%1:_(p1) = G_CONSTANT i64 0
- G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap)
+ G_TRAP
bb.1:
G_STORE %0, %1 :: (store 1, addrspace 1)
@@ -55,7 +55,7 @@ body: |
; GCN-NEXT: S_ENDPGM 0
bb.0:
%0:_(s8) = G_CONSTANT i8 0
- G_INTRINSIC_W_SIDE_EFFECTS intrinsic(@llvm.trap)
+ G_TRAP
%1:_(p1) = G_CONSTANT i64 0
bb.1:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
index 623360f..de46037 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll
@@ -147,6 +147,34 @@ main_body:
ret half %res
}
+define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_rtz_f16:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s3, exec_lo
+; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GCN-NEXT: s_mov_b32 exec_lo, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT: v_add_f16_e32 v0, v3, v0
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
+ %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
+ %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
+ %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
+ %res = fadd half %l_p1, %h_p1
+ ret half %res
+}
+
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
; GCN-LABEL: v_interp_f16_imm_params:
; GCN: ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
index 6eed92b..6d4aa3b 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.sbfe.ll
@@ -670,36 +670,19 @@ define amdgpu_kernel void @bfe_sext_in_reg_i24(ptr addrspace(1) %out, ptr addrsp
define amdgpu_kernel void @simplify_demanded_bfe_sdiv(ptr addrspace(1) %out, ptr addrspace(1) %in) #0 {
; GFX6-LABEL: simplify_demanded_bfe_sdiv:
; GFX6: ; %bb.0:
-; GFX6-NEXT: v_rcp_iflag_f32_e32 v0, 2.0
-; GFX6-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x0
-; GFX6-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0
-; GFX6-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GFX6-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x0
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_load_dword s0, s[6:7], 0x0
-; GFX6-NEXT: s_mov_b32 s6, -1
-; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_mul_lo_u32 v1, v0, -2
-; GFX6-NEXT: s_waitcnt lgkmcnt(0)
-; GFX6-NEXT: s_bfe_i32 s0, s0, 0x100001
-; GFX6-NEXT: s_ashr_i32 s2, s0, 31
-; GFX6-NEXT: v_mul_hi_u32 v1, v0, v1
-; GFX6-NEXT: s_add_i32 s0, s0, s2
-; GFX6-NEXT: s_xor_b32 s0, s0, s2
-; GFX6-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GFX6-NEXT: v_mul_hi_u32 v0, s0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v1, 1, v0
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT: v_sub_i32_e32 v1, vcc, s0, v1
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT: v_subrev_i32_e64 v2, s[0:1], 2, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v1, v1, v2, vcc
-; GFX6-NEXT: v_add_i32_e32 v2, vcc, 1, v0
-; GFX6-NEXT: v_cmp_le_u32_e32 vcc, 2, v1
-; GFX6-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
-; GFX6-NEXT: v_xor_b32_e32 v0, s2, v0
-; GFX6-NEXT: v_subrev_i32_e32 v0, vcc, s2, v0
-; GFX6-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; GFX6-NEXT: s_load_dword s3, s[2:3], 0x0
+; GFX6-NEXT: s_mov_b32 s2, -1
+; GFX6-NEXT: s_waitcnt lgkmcnt(0)
+; GFX6-NEXT: s_bfe_i32 s3, s3, 0x100001
+; GFX6-NEXT: s_ashr_i32 s4, s3, 31
+; GFX6-NEXT: s_lshr_b32 s4, s4, 31
+; GFX6-NEXT: s_add_i32 s3, s3, s4
+; GFX6-NEXT: s_ashr_i32 s3, s3, 1
+; GFX6-NEXT: v_mov_b32_e32 v0, s3
+; GFX6-NEXT: s_mov_b32 s3, 0xf000
+; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0
; GFX6-NEXT: s_endpgm
%src = load i32, ptr addrspace(1) %in, align 4
%bfe = call i32 @llvm.amdgcn.sbfe.i32(i32 %src, i32 1, i32 16)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
index 686b849..06bd45a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.buffer.load.format.ll
@@ -1,6 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX8 %s
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1200 -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX12 %s
+; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted
define amdgpu_ps float @struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(<4 x i32> inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; GFX8-LABEL: name: struct_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
index 9edc2455..1e3f94a 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.struct.ptr.buffer.load.format.ll
@@ -1,5 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji -stop-after=instruction-select -verify-machineinstrs -o - %s | FileCheck %s
+; Note that TFE instructions don't have the result initialization to zero due to stopping before finalize-isel - which is where that's inserted
define amdgpu_ps float @struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset(ptr addrspace(8) inreg %rsrc, i32 %vindex, i32 %voffset, i32 inreg %soffset) {
; CHECK-LABEL: name: struct_ptr_buffer_load_format_f32__sgpr_rsrc__vgpr_vindex__vgpr_voffset__sgpr_soffset
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
index d36f5c0..a6f9bb7e 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_saddsat_i48(i48 %lhs, i48 %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_ashr_i32 s2, s7, 31
; GFX6-NEXT: s_ashr_i32 s5, s7, 15
-; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT: s_addk_i32 s2, 0x8000
; GFX6-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_ashr_i32 s2, s7, 31
; GFX8-NEXT: s_ashr_i32 s5, s7, 15
-; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT: s_addk_i32 s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_saddsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-LABEL: saddsat_i48_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @saddsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_saddsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: s_ashr_i32 s2, s5, 31
-; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX8-NEXT: s_ashr_i32 s2, s5, 31
-; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_saddsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @saddsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_add_co_u32_e32 v8, vcc, v0, v4
; GFX9-NEXT: v_addc_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[4:5]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_add_co_u32_e32 v4, vcc, v2, v6
; GFX9-NEXT: v_addc_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[4:5], 0, v[6:7]
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX10-NEXT: v_cmp_gt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT: v_cmp_gt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_saddsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
; GFX11-NEXT: v_cmp_gt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX6-NEXT: s_ashr_i32 s4, s9, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; GFX6-NEXT: s_ashr_i32 s4, s1, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX8-NEXT: s_ashr_i32 s4, s9, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; GFX8-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s4, s9, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], s[6:7], 0
; GFX9-NEXT: s_ashr_i32 s4, s1, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
; GFX10-NEXT: s_ashr_i32 s4, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s8, s1, s0
; GFX10-NEXT: s_add_u32 s0, s2, s6
; GFX10-NEXT: s_addc_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX10-NEXT: s_ashr_i32 s4, s1, 31
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s1, s3, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], 0
; GFX11-NEXT: s_ashr_i32 s4, s9, 31
-; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s8, s1, s0
; GFX11-NEXT: s_add_u32 s0, s2, s6
; GFX11-NEXT: s_addc_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_saddsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s1, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5132,7 +5131,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s9, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s4
; GFX6-NEXT: v_mov_b32_e32 v3, s5
@@ -5179,7 +5178,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s9, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s4
; GFX8-NEXT: v_mov_b32_e32 v3, s5
@@ -5226,7 +5225,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s9, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s4
; GFX9-NEXT: v_mov_b32_e32 v3, s5
@@ -5269,7 +5268,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX10-NEXT: v_mov_b32_e32 v2, s5
; GFX10-NEXT: s_ashr_i32 s0, s9, 31
-; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_mov_b32_e32 v1, s4
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5310,7 +5309,7 @@ define amdgpu_ps i128 @s_saddsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX11-NEXT: v_mov_b32_e32 v2, s5
; GFX11-NEXT: s_ashr_i32 s0, s9, 31
-; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX11-NEXT: v_dual_mov_b32 v1, s4 :: v_dual_and_b32 v0, 1, v0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5412,9 +5411,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX9-NEXT: v_cndmask_b32_e64 v2, v7, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX9-NEXT: v_bfrev_b32_e32 v6, 1
-; GFX9-NEXT: v_add_co_u32_e32 v6, vcc, v3, v6
; GFX9-NEXT: v_and_b32_e32 v2, 1, v2
+; GFX9-NEXT: v_add_u32_e32 v6, 0x80000000, v3
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
@@ -5440,7 +5438,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX10-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX10-NEXT: v_add_co_u32 v6, s0, 0x80000000, v3
+; GFX10-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
; GFX10-NEXT: v_and_b32_e32 v2, 1, v2
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v3, vcc_lo
@@ -5467,7 +5465,7 @@ define amdgpu_ps <4 x float> @saddsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v5
; GFX11-NEXT: v_cndmask_b32_e64 v2, v8, 0, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v2, v2, v6
-; GFX11-NEXT: v_add_co_u32 v6, null, 0x80000000, v3
+; GFX11-NEXT: v_add_nc_u32_e32 v6, 0x80000000, v3
; GFX11-NEXT: v_and_b32_e32 v2, 1, v2
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc_lo
@@ -5569,9 +5567,8 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, s[0:1]
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5597,9 +5594,9 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
-; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5627,15 +5624,14 @@ define amdgpu_ps <4 x float> @saddsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
; GFX11-NEXT: v_cndmask_b32_e64 v1, v8, 0, s0
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v6, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v2 :: v_dual_cndmask_b32 v3, v7, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo
; GFX11-NEXT: ; return to shader part epilog
%result = call i128 @llvm.sadd.sat.i128(i128 %lhs, i128 %rhs)
%cast = bitcast i128 %result to <4 x float>
@@ -5762,12 +5758,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v17
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[10:11]
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc
@@ -5786,11 +5781,11 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11
; GFX9-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc
; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, 0, v[14:15]
+; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6
; GFX9-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, 0, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
@@ -5832,18 +5827,18 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v19
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v17
-; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX10-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v3, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v17
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v12, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v13, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v18, v6, s4
@@ -5882,18 +5877,17 @@ define <2 x i128> @v_saddsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v19
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v17
-; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX11-NEXT: v_cndmask_b32_e64 v2, v4, 0, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3
; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v3, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v3, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v17, v4, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v17
+; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v2, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v16, v2 :: v_dual_cndmask_b32 v3, v17, v4
; GFX11-NEXT: v_cndmask_b32_e64 v4, v12, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v13, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v18, v6, s0
@@ -5927,7 +5921,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s17, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s9
@@ -5960,7 +5954,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
@@ -6011,7 +6005,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s17, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
@@ -6050,7 +6044,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
@@ -6101,7 +6095,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s17, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
@@ -6140,7 +6134,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -6184,7 +6178,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_and_b32 s1, 1, s1
; GFX10-NEXT: s_ashr_i32 s10, s17, 31
; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX10-NEXT: s_add_u32 s11, s10, 0x80000000
+; GFX10-NEXT: s_add_i32 s11, s10, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX10-NEXT: s_add_u32 s0, s4, s12
@@ -6221,7 +6215,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
; GFX10-NEXT: v_mov_b32_e32 v2, s17
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s10, vcc_lo
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: v_readfirstlane_b32 s1, v4
; GFX10-NEXT: v_and_b32_e32 v1, 1, v1
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
@@ -6261,7 +6255,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: s_and_b32 s1, 1, s1
; GFX11-NEXT: s_ashr_i32 s10, s17, 31
; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1
-; GFX11-NEXT: s_add_u32 s11, s10, 0x80000000
+; GFX11-NEXT: s_add_i32 s11, s10, 0x80000000
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0
; GFX11-NEXT: s_add_u32 s0, s4, s12
@@ -6299,7 +6293,7 @@ define amdgpu_ps <2 x i128> @s_saddsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: v_and_b32_e32 v1, 1, v1
; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, s10, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s11, vcc_lo
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v1
; GFX11-NEXT: v_mov_b32_e32 v1, s2
; GFX11-NEXT: v_readfirstlane_b32 s1, v4
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
index 1061f00..2c2f8e9 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i32.ll
@@ -279,125 +279,27 @@ define i32 @v_sdiv_i32_pow2k_denom(i32 %num) {
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CHECK-NEXT: v_ashrrev_i32_e32 v1, 31, v0
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, 0x45800000
-; CHECK-NEXT: v_mov_b32_e32 v3, 0xfffff000
-; CHECK-NEXT: v_mov_b32_e32 v4, 0x1000
+; CHECK-NEXT: v_lshrrev_b32_e32 v1, 20, v1
; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x4f7ffffe, v2
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_cvt_u32_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_lo_u32 v3, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v2, v3
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_mul_hi_u32 v2, v0, v2
-; CHECK-NEXT: v_lshlrev_b32_e32 v3, 12, v2
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, 1, v2
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[4:5]
-; CHECK-NEXT: v_subrev_i32_e32 v3, vcc, 0x1000, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v0, v0, v3, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, 1, v2
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v1
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i32 %num, 4096
ret i32 %result
}
define <2 x i32> @v_sdiv_v2i32_pow2k_denom(<2 x i32> %num) {
-; GISEL-LABEL: v_sdiv_v2i32_pow2k_denom:
-; GISEL: ; %bb.0:
-; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; GISEL-NEXT: v_mov_b32_e32 v3, 0x1000
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000
-; GISEL-NEXT: v_mov_b32_e32 v5, 0xfffff000
-; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x4f7ffffe, v4
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_cvt_u32_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_lo_u32 v5, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v4, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v0, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v1, v4
-; GISEL-NEXT: v_lshlrev_b32_e32 v7, 12, v5
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v5
-; GISEL-NEXT: v_lshlrev_b32_e32 v9, 12, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 1, v4
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v5, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v0, v3
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v4, v4, v10, s[6:7]
-; GISEL-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v0, v5, v7, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v1, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v4, v8, vcc
-; GISEL-NEXT: v_xor_b32_e32 v0, v0, v2
-; GISEL-NEXT: v_xor_b32_e32 v1, v1, v6
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
-; GISEL-NEXT: s_setpc_b64 s[30:31]
-;
-; CGP-LABEL: v_sdiv_v2i32_pow2k_denom:
-; CGP: ; %bb.0:
-; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_ashrrev_i32_e32 v2, 31, v0
-; CGP-NEXT: v_rcp_iflag_f32_e32 v3, 0x45800000
-; CGP-NEXT: v_mov_b32_e32 v4, 0xfffff000
-; CGP-NEXT: v_mov_b32_e32 v5, 0x1000
-; CGP-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_mul_f32_e32 v3, 0x4f7ffffe, v3
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_cvt_u32_f32_e32 v3, v3
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_mul_lo_u32 v4, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v3, v4
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CGP-NEXT: v_mul_hi_u32 v4, v0, v3
-; CGP-NEXT: v_mul_hi_u32 v3, v1, v3
-; CGP-NEXT: v_lshlrev_b32_e32 v7, 12, v4
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v4
-; CGP-NEXT: v_lshlrev_b32_e32 v9, 12, v3
-; CGP-NEXT: v_add_i32_e32 v10, vcc, 1, v3
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v9
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v4, v4, v8, s[4:5]
-; CGP-NEXT: v_sub_i32_e32 v7, vcc, v0, v5
-; CGP-NEXT: v_cmp_ge_u32_e64 s[6:7], v1, v5
-; CGP-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[6:7]
-; CGP-NEXT: v_subrev_i32_e32 v8, vcc, 0x1000, v1
-; CGP-NEXT: v_cndmask_b32_e64 v0, v0, v7, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v7, vcc, 1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7]
-; CGP-NEXT: v_add_i32_e32 v8, vcc, 1, v3
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v0, v4, v7, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v1, v5
-; CGP-NEXT: v_cndmask_b32_e32 v1, v3, v8, vcc
-; CGP-NEXT: v_xor_b32_e32 v0, v0, v2
-; CGP-NEXT: v_xor_b32_e32 v1, v1, v6
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_sub_i32_e32 v1, vcc, v1, v6
-; CGP-NEXT: s_setpc_b64 s[30:31]
+; CHECK-LABEL: v_sdiv_v2i32_pow2k_denom:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v3, 31, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2
+; CHECK-NEXT: v_lshrrev_b32_e32 v3, 20, v3
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v3
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v1, 12, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i32> %num, <i32 4096, i32 4096>
ret <2 x i32> %result
}
@@ -884,3 +786,24 @@ define <2 x i32> @v_sdiv_v2i32_24bit(<2 x i32> %num, <2 x i32> %den) {
%result = sdiv <2 x i32> %num.mask, %den.mask
ret <2 x i32> %result
}
+
+define i32 @v_sdiv_i32_exact(i32 %num) {
+; CHECK-LABEL: v_sdiv_i32_exact:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %result = sdiv exact i32 %num, 4096
+ ret i32 %result
+}
+
+define <2 x i32> @v_sdiv_v2i32_exact(<2 x i32> %num) {
+; CHECK-LABEL: v_sdiv_v2i32_exact:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; CHECK-NEXT: v_ashrrev_i32_e32 v1, 10, v1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024>
+ ret <2 x i32> %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
index 0a6b7af..377fa24 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdiv.i64.ll
@@ -999,126 +999,11 @@ define i64 @v_sdiv_i64_pow2k_denom(i64 %num) {
; CHECK-LABEL: v_sdiv_i64_pow2k_denom:
; CHECK: ; %bb.0:
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: v_cvt_f32_u32_e32 v2, 0x1000
-; CHECK-NEXT: v_cvt_f32_ubyte0_e32 v3, 0
-; CHECK-NEXT: v_mov_b32_e32 v6, 0xfffff000
-; CHECK-NEXT: v_mac_f32_e32 v2, 0x4f800000, v3
-; CHECK-NEXT: v_rcp_iflag_f32_e32 v2, v2
-; CHECK-NEXT: v_mul_f32_e32 v2, 0x5f7ffffc, v2
-; CHECK-NEXT: v_mul_f32_e32 v3, 0x2f800000, v2
-; CHECK-NEXT: v_trunc_f32_e32 v4, v3
-; CHECK-NEXT: v_mac_f32_e32 v2, 0xcf800000, v4
-; CHECK-NEXT: v_cvt_u32_f32_e32 v5, v2
-; CHECK-NEXT: v_cvt_u32_f32_e32 v7, v4
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v2
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_mul_lo_u32 v4, v7, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v9, v5, v3
-; CHECK-NEXT: v_mul_lo_u32 v10, v7, v3
-; CHECK-NEXT: v_mul_hi_u32 v11, v5, v3
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v9
-; CHECK-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v10, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v9, v4
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v11
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v8, vcc, v10, v8
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v4
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, v8, v4
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v3, v4
-; CHECK-NEXT: v_add_i32_e32 v5, vcc, v5, v2
-; CHECK-NEXT: v_addc_u32_e32 v7, vcc, v7, v3, vcc
-; CHECK-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v6, v5, 0
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v6, v7, v[3:4]
-; CHECK-NEXT: v_ashrrev_i32_e32 v6, 31, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_mad_u64_u32 v[3:4], s[4:5], -1, v5, v[3:4]
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v1, v6, vcc
-; CHECK-NEXT: v_xor_b32_e32 v4, v0, v6
-; CHECK-NEXT: v_mul_lo_u32 v0, v7, v2
-; CHECK-NEXT: v_mul_lo_u32 v8, v5, v3
-; CHECK-NEXT: v_xor_b32_e32 v9, v1, v6
-; CHECK-NEXT: v_mul_hi_u32 v1, v5, v2
-; CHECK-NEXT: v_mul_hi_u32 v2, v7, v2
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v1, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; CHECK-NEXT: v_mul_hi_u32 v8, v5, v3
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v2
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; CHECK-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v8
-; CHECK-NEXT: v_mul_hi_u32 v3, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v2, v1
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, v3, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; CHECK-NEXT: v_addc_u32_e32 v1, vcc, v7, v1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v2, v9, v0
-; CHECK-NEXT: v_mul_lo_u32 v3, v4, v1
-; CHECK-NEXT: v_mul_hi_u32 v7, v4, v0
-; CHECK-NEXT: v_mul_hi_u32 v0, v9, v0
-; CHECK-NEXT: v_mov_b32_e32 v5, 0x1000
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_mul_lo_u32 v7, v9, v1
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_mul_hi_u32 v3, v4, v1
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; CHECK-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v3
-; CHECK-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CHECK-NEXT: v_add_i32_e32 v7, vcc, v0, v2
-; CHECK-NEXT: v_mul_hi_u32 v8, v9, v1
-; CHECK-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v7, 0
-; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CHECK-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CHECK-NEXT: v_add_i32_e32 v3, vcc, v8, v2
-; CHECK-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v5, v3, v[1:2]
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v4, v0
-; CHECK-NEXT: v_subb_u32_e64 v2, s[4:5], v9, v1, vcc
-; CHECK-NEXT: v_sub_i32_e64 v1, s[4:5], v9, v1
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v4, 0, -1, s[4:5]
-; CHECK-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
-; CHECK-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e64 v2, -1, v4, s[4:5]
-; CHECK-NEXT: v_add_i32_e32 v4, vcc, 1, v7
-; CHECK-NEXT: v_addc_u32_e32 v8, vcc, 0, v3, vcc
-; CHECK-NEXT: v_cmp_ge_u32_e32 vcc, v0, v5
-; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, -1, vcc
-; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1
-; CHECK-NEXT: v_cndmask_b32_e32 v0, -1, v0, vcc
-; CHECK-NEXT: v_add_i32_e32 v1, vcc, 1, v4
-; CHECK-NEXT: v_addc_u32_e32 v5, vcc, 0, v8, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v8, v5, vcc
-; CHECK-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CHECK-NEXT: v_cndmask_b32_e32 v0, v7, v0, vcc
-; CHECK-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
-; CHECK-NEXT: v_xor_b32_e32 v0, v0, v6
-; CHECK-NEXT: v_xor_b32_e32 v1, v1, v6
-; CHECK-NEXT: v_sub_i32_e32 v0, vcc, v0, v6
-; CHECK-NEXT: v_subb_u32_e32 v1, vcc, v1, v6, vcc
+; CHECK-NEXT: v_ashrrev_i32_e32 v2, 31, v1
+; CHECK-NEXT: v_lshrrev_b32_e32 v2, 20, v2
+; CHECK-NEXT: v_add_i32_e32 v0, vcc, v0, v2
+; CHECK-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12
; CHECK-NEXT: s_setpc_b64 s[30:31]
%result = sdiv i64 %num, 4096
ret i64 %result
@@ -1128,473 +1013,31 @@ define <2 x i64> @v_sdiv_v2i64_pow2k_denom(<2 x i64> %num) {
; GISEL-LABEL: v_sdiv_v2i64_pow2k_denom:
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, 0x1000
-; GISEL-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT: s_subb_u32 s7, 0, 0
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v6, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v7, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], s6, v6, 0
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v7, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT: v_mul_hi_u32 v9, v6, v4
-; GISEL-NEXT: v_mul_hi_u32 v4, v7, v4
-; GISEL-NEXT: v_mul_lo_u32 v10, v6, v8
-; GISEL-NEXT: v_mul_lo_u32 v11, v7, v8
-; GISEL-NEXT: v_mul_hi_u32 v12, v6, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v10, v5
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v11, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v10
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v6, v4
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v11, 0
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v7, v5, vcc
-; GISEL-NEXT: v_mov_b32_e32 v4, v9
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s6, v5, v[4:5]
; GISEL-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 20, v4
+; GISEL-NEXT: v_ashrrev_i32_e32 v5, 31, v3
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; GISEL-NEXT: v_mad_u64_u32 v[9:10], s[4:5], s7, v11, v[9:10]
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v1, v4, vcc
-; GISEL-NEXT: v_xor_b32_e32 v10, v0, v4
-; GISEL-NEXT: v_mul_lo_u32 v0, v5, v8
-; GISEL-NEXT: v_mul_lo_u32 v12, v11, v9
-; GISEL-NEXT: v_xor_b32_e32 v13, v1, v4
-; GISEL-NEXT: v_mul_hi_u32 v1, v11, v8
-; GISEL-NEXT: v_mul_hi_u32 v8, v5, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v5, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v12, v0
-; GISEL-NEXT: v_mul_hi_u32 v12, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_mul_hi_u32 v9, v5, v9
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v9, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_addc_u32_e32 v1, vcc, v5, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v11, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_mov_b32_e32 v5, 0x1000
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v11, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_mul_hi_u32 v9, v10, v1
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v11, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v9
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v11, v9
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v0, v8
-; GISEL-NEXT: v_mul_hi_u32 v12, v13, v1
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v5, v11, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v9, v8
-; GISEL-NEXT: v_add_i32_e32 v12, vcc, v12, v8
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v5, v12, v[1:2]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v10, v0
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], 0, v11, v[8:9]
-; GISEL-NEXT: s_sub_u32 s6, 0, 0x1000
-; GISEL-NEXT: s_subb_u32 s7, 0, 0
-; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], v13, v8, vcc
-; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], v13, v8
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v0, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v10, -1, v9, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v11
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v6, 0
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v12, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v9
-; GISEL-NEXT: v_cndmask_b32_e32 v15, -1, v8, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s6, v7, v[1:2]
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, 1, v13
-; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], s7, v6, v[8:9]
-; GISEL-NEXT: v_addc_u32_e32 v16, vcc, 0, v14, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v15
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v13, v1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v1, v7, v0
-; GISEL-NEXT: v_mul_lo_u32 v13, v6, v8
-; GISEL-NEXT: v_mul_hi_u32 v15, v6, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v14, v14, v16, vcc
-; GISEL-NEXT: v_mul_hi_u32 v0, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v1, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v15, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_mul_hi_u32 v13, v6, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v15, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v13
-; GISEL-NEXT: v_mul_hi_u32 v8, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v6, v0
-; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v7, v1, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], s6, v8, 0
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
-; GISEL-NEXT: v_cndmask_b32_e32 v9, v11, v9, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s6, v13, v[1:2]
-; GISEL-NEXT: v_xor_b32_e32 v1, v9, v4
-; GISEL-NEXT: v_ashrrev_i32_e32 v9, 31, v3
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], s7, v8, v[6:7]
-; GISEL-NEXT: v_cndmask_b32_e32 v10, v12, v14, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v9, vcc
-; GISEL-NEXT: v_xor_b32_e32 v11, v2, v9
-; GISEL-NEXT: v_mul_lo_u32 v2, v13, v0
-; GISEL-NEXT: v_mul_lo_u32 v7, v8, v6
-; GISEL-NEXT: v_xor_b32_e32 v12, v3, v9
-; GISEL-NEXT: v_mul_hi_u32 v3, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v13, v0
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v13, v6
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v7, v2
-; GISEL-NEXT: v_mul_hi_u32 v7, v8, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_mul_hi_u32 v6, v13, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; GISEL-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_addc_u32_e32 v2, vcc, v13, v2, vcc
-; GISEL-NEXT: v_mul_lo_u32 v3, v12, v0
-; GISEL-NEXT: v_mul_lo_u32 v6, v11, v2
-; GISEL-NEXT: v_mul_hi_u32 v7, v11, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v12, v0
-; GISEL-NEXT: v_xor_b32_e32 v8, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v3, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v2
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, v6, v3
-; GISEL-NEXT: v_mul_hi_u32 v6, v11, v2
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v3
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v2
-; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v10, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v7, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v3
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v5, v13, v[0:1]
-; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v1, v4
-; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v8, v4, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[4:5], 0, v10, v[6:7]
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v11, v2
-; GISEL-NEXT: v_subb_u32_e64 v4, s[4:5], v12, v3, vcc
-; GISEL-NEXT: v_sub_i32_e64 v3, s[4:5], v12, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v5
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v4, -1, v6, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, 1, v10
-; GISEL-NEXT: v_addc_u32_e32 v7, vcc, 0, v13, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v2, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v6
-; GISEL-NEXT: v_addc_u32_e32 v5, vcc, 0, v7, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v6, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v7, v5, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
-; GISEL-NEXT: v_xor_b32_e32 v2, v2, v9
-; GISEL-NEXT: v_xor_b32_e32 v3, v3, v9
-; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v2, v9
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 20, v5
+; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, v2, v5
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_ashr_i64 v[0:1], v[0:1], 12
+; GISEL-NEXT: v_ashr_i64 v[2:3], v[2:3], 12
; GISEL-NEXT: s_setpc_b64 s[30:31]
;
; CGP-LABEL: v_sdiv_v2i64_pow2k_denom:
; CGP: ; %bb.0:
; CGP-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CGP-NEXT: v_cvt_f32_u32_e32 v4, 0x1000
-; CGP-NEXT: v_cvt_f32_ubyte0_e32 v5, 0
-; CGP-NEXT: v_mov_b32_e32 v6, 0xfffff000
-; CGP-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; CGP-NEXT: v_rcp_iflag_f32_e32 v4, v4
-; CGP-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; CGP-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; CGP-NEXT: v_trunc_f32_e32 v7, v5
-; CGP-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; CGP-NEXT: v_cvt_u32_f32_e32 v8, v4
-; CGP-NEXT: v_cvt_u32_f32_e32 v9, v7
-; CGP-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v6, v8, 0
-; CGP-NEXT: v_mov_b32_e32 v7, v5
-; CGP-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v6, v9, v[7:8]
-; CGP-NEXT: v_mul_hi_u32 v12, v9, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], -1, v8, v[10:11]
-; CGP-NEXT: v_mul_lo_u32 v10, v9, v4
-; CGP-NEXT: v_mul_hi_u32 v11, v8, v4
-; CGP-NEXT: v_mul_lo_u32 v4, v8, v13
-; CGP-NEXT: v_mul_lo_u32 v7, v9, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v8, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v9, v13
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v10, v4
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v4, v11
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v15, v4
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v12
-; CGP-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v7, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v15, v14
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v7, v4
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v14, v7
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v13, v7
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v8, v4
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v6, v16, 0
-; CGP-NEXT: v_addc_u32_e32 v17, vcc, v9, v7, vcc
-; CGP-NEXT: v_mov_b32_e32 v4, v14
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v6, v17, v[4:5]
-; CGP-NEXT: v_ashrrev_i32_e32 v7, 31, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v7
-; CGP-NEXT: v_mad_u64_u32 v[14:15], s[4:5], -1, v16, v[14:15]
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v1, v7, vcc
-; CGP-NEXT: v_xor_b32_e32 v15, v0, v7
-; CGP-NEXT: v_mul_lo_u32 v0, v17, v13
-; CGP-NEXT: v_mul_lo_u32 v4, v16, v14
-; CGP-NEXT: v_xor_b32_e32 v18, v1, v7
-; CGP-NEXT: v_mul_hi_u32 v1, v16, v13
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v13
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v1
+; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4
; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v1
-; CGP-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v1, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; CGP-NEXT: v_mul_hi_u32 v4, v16, v14
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v13
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v4
-; CGP-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; CGP-NEXT: v_mul_hi_u32 v13, v17, v14
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v1, v0
-; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v4, v1
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v13, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_addc_u32_e32 v1, vcc, v17, v1, vcc
-; CGP-NEXT: v_mul_lo_u32 v13, v18, v0
-; CGP-NEXT: v_mul_lo_u32 v14, v15, v1
-; CGP-NEXT: v_mul_hi_u32 v16, v15, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v18, v0
-; CGP-NEXT: v_mov_b32_e32 v4, 0x1000
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v13, v16
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v16, v18, v1
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_mul_hi_u32 v14, v15, v1
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v16, v0
-; CGP-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v14
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v14, vcc, v16, v14
-; CGP-NEXT: v_add_i32_e32 v16, vcc, v0, v13
-; CGP-NEXT: v_mul_hi_u32 v17, v18, v1
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v4, v16, 0
-; CGP-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v13, vcc, v14, v13
-; CGP-NEXT: v_add_i32_e32 v17, vcc, v17, v13
-; CGP-NEXT: v_mad_u64_u32 v[13:14], s[4:5], v4, v17, v[1:2]
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v15, v0
-; CGP-NEXT: v_subb_u32_e64 v1, s[4:5], v18, v13, vcc
-; CGP-NEXT: v_sub_i32_e64 v13, s[4:5], v18, v13
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v0, v4
-; CGP-NEXT: v_cndmask_b32_e64 v14, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v1
-; CGP-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v13, vcc
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v0, v4
-; CGP-NEXT: v_subbrev_u32_e32 v13, vcc, 0, v1, vcc
-; CGP-NEXT: v_add_i32_e32 v15, vcc, 1, v16
-; CGP-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v0, v4
-; CGP-NEXT: v_mov_b32_e32 v0, v5
-; CGP-NEXT: v_cndmask_b32_e64 v14, -1, v14, s[4:5]
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v9, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v13
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], -1, v8, v[0:1]
-; CGP-NEXT: v_cndmask_b32_e32 v5, -1, v19, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, 1, v15
-; CGP-NEXT: v_mul_lo_u32 v19, v8, v0
-; CGP-NEXT: v_addc_u32_e32 v13, vcc, 0, v18, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v5, v15, v1, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v13, v18, v13, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v19
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v1, v11
-; CGP-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v11, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT: v_mul_hi_u32 v10, v8, v0
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v11, v12
-; CGP-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v11, vcc, v12, v11
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; CGP-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v10, vcc, v11, v10
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v10
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v1
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, v9, v0, vcc
-; CGP-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v8, 0
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
-; CGP-NEXT: v_cndmask_b32_e32 v5, v16, v5, vcc
-; CGP-NEXT: v_xor_b32_e32 v11, v5, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v6, v9, v[1:2]
-; CGP-NEXT: v_cndmask_b32_e32 v10, v17, v13, vcc
-; CGP-NEXT: v_xor_b32_e32 v1, v10, v7
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], -1, v8, v[5:6]
-; CGP-NEXT: v_ashrrev_i32_e32 v10, 31, v3
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_addc_u32_e32 v3, vcc, v3, v10, vcc
-; CGP-NEXT: v_xor_b32_e32 v12, v2, v10
-; CGP-NEXT: v_mul_lo_u32 v2, v9, v0
-; CGP-NEXT: v_mul_lo_u32 v6, v8, v5
-; CGP-NEXT: v_xor_b32_e32 v13, v3, v10
-; CGP-NEXT: v_mul_hi_u32 v3, v8, v0
-; CGP-NEXT: v_mul_hi_u32 v0, v9, v0
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v3
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v3, v9, v5
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v6, v2
-; CGP-NEXT: v_mul_hi_u32 v6, v8, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v3, v0
-; CGP-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_mul_hi_u32 v5, v9, v5
-; CGP-NEXT: v_add_i32_e32 v0, vcc, v0, v2
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v3, v2
-; CGP-NEXT: v_add_i32_e32 v2, vcc, v5, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v8, v0
-; CGP-NEXT: v_addc_u32_e32 v2, vcc, v9, v2, vcc
-; CGP-NEXT: v_mul_lo_u32 v5, v13, v3
-; CGP-NEXT: v_mul_lo_u32 v6, v12, v2
-; CGP-NEXT: v_sub_i32_e32 v0, vcc, v11, v7
-; CGP-NEXT: v_subb_u32_e32 v1, vcc, v1, v7, vcc
-; CGP-NEXT: v_mul_hi_u32 v7, v12, v3
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v5, v7
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_mul_lo_u32 v7, v13, v2
-; CGP-NEXT: v_mul_hi_u32 v3, v13, v3
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_mul_hi_u32 v6, v12, v2
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v7, v3
-; CGP-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v3, vcc, v3, v6
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; CGP-NEXT: v_add_i32_e32 v7, vcc, v3, v5
-; CGP-NEXT: v_mul_hi_u32 v8, v13, v2
-; CGP-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v4, v7, 0
-; CGP-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; CGP-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; CGP-NEXT: v_add_i32_e32 v8, vcc, v8, v5
-; CGP-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, v[3:4]
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v12, v2
-; CGP-NEXT: v_subb_u32_e64 v3, s[4:5], v13, v5, vcc
-; CGP-NEXT: v_sub_i32_e64 v5, s[4:5], v13, v5
-; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT: v_cmp_ge_u32_e64 s[4:5], v2, v4
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; CGP-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
-; CGP-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
-; CGP-NEXT: v_cndmask_b32_e64 v3, -1, v6, s[4:5]
-; CGP-NEXT: v_add_i32_e32 v6, vcc, 1, v7
-; CGP-NEXT: v_addc_u32_e32 v9, vcc, 0, v8, vcc
-; CGP-NEXT: v_cmp_ge_u32_e32 vcc, v2, v4
-; CGP-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
-; CGP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v5
-; CGP-NEXT: v_cndmask_b32_e32 v2, -1, v2, vcc
-; CGP-NEXT: v_add_i32_e32 v4, vcc, 1, v6
-; CGP-NEXT: v_addc_u32_e32 v5, vcc, 0, v9, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; CGP-NEXT: v_cndmask_b32_e32 v2, v6, v4, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v4, v9, v5, vcc
-; CGP-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; CGP-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
-; CGP-NEXT: v_cndmask_b32_e32 v3, v8, v4, vcc
-; CGP-NEXT: v_xor_b32_e32 v2, v2, v10
-; CGP-NEXT: v_xor_b32_e32 v3, v3, v10
-; CGP-NEXT: v_sub_i32_e32 v2, vcc, v2, v10
-; CGP-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; CGP-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; CGP-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; CGP-NEXT: v_lshrrev_b32_e32 v4, 20, v4
+; CGP-NEXT: v_add_i32_e32 v2, vcc, v2, v4
+; CGP-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc
+; CGP-NEXT: v_ashr_i64 v[0:1], v[0:1], 12
+; CGP-NEXT: v_ashr_i64 v[2:3], v[2:3], 12
; CGP-NEXT: s_setpc_b64 s[30:31]
%result = sdiv <2 x i64> %num, <i64 4096, i64 4096>
ret <2 x i64> %result
@@ -3091,253 +2534,252 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 0, v1
-; GISEL-NEXT: v_addc_u32_e64 v1, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1
; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v3
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v1, vcc
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v9, v7
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v3, v0, v[5:6]
-; GISEL-NEXT: v_and_b32_e32 v2, 0xffffff, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v1, v9, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v11, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v0, v[5:6]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v11, v7
+; GISEL-NEXT: v_subb_u32_e64 v7, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v3
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], 0, v2
-; GISEL-NEXT: v_addc_u32_e64 v2, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v4
-; GISEL-NEXT: v_cvt_f32_u32_e32 v13, v2
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v1
-; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v1, vcc
-; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v13
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v7, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v6, v3
-; GISEL-NEXT: v_subbrev_u32_e32 v11, vcc, 0, v5, vcc
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v7
-; GISEL-NEXT: v_mul_f32_e32 v6, 0x2f800000, v5
-; GISEL-NEXT: v_trunc_f32_e32 v6, v6
-; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v2
+; GISEL-NEXT: v_cvt_f32_u32_e32 v11, v2
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v7, v3
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v11, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v11
+; GISEL-NEXT: v_cndmask_b32_e64 v7, v8, v9, s[4:5]
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v6, v1
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
+; GISEL-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v5, vcc
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
+; GISEL-NEXT: v_trunc_f32_e32 v5, v5
+; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v11, v4
+; GISEL-NEXT: v_sub_i32_e32 v14, vcc, 0, v2
; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v5
-; GISEL-NEXT: v_sub_i32_e32 v15, vcc, 0, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v13, 0
-; GISEL-NEXT: v_subb_u32_e32 v16, vcc, 0, v2, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v14, v[6:7]
-; GISEL-NEXT: v_add_i32_e32 v17, vcc, 1, v9
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v13, v[6:7]
-; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v0, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v11, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v10, v3
-; GISEL-NEXT: v_mul_lo_u32 v7, v14, v5
-; GISEL-NEXT: v_mul_lo_u32 v10, v13, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v11, v1
-; GISEL-NEXT: v_mul_hi_u32 v1, v13, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v3, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v7, v1
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v11, 0
+; GISEL-NEXT: v_subb_u32_e32 v15, vcc, 0, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v13, v[5:6]
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 1, v10
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v11, v[5:6]
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, 0, v0, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v8, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v13, v4
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v18, v1, vcc
+; GISEL-NEXT: v_mul_hi_u32 v1, v11, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v6, v1
; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v14, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v14, v5
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v10, v1
-; GISEL-NEXT: v_mul_hi_u32 v10, v13, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v10
-; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
-; GISEL-NEXT: v_mul_hi_u32 v6, v14, v6
-; GISEL-NEXT: v_add_i32_e32 v1, vcc, v5, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v13, v1
-; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v14, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v10, 0
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v17
-; GISEL-NEXT: v_mov_b32_e32 v1, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v15, v11, v[1:2]
-; GISEL-NEXT: v_addc_u32_e32 v14, vcc, 0, v18, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v10, v[6:7]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v13, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v18, v14, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
-; GISEL-NEXT: v_mul_lo_u32 v7, v11, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v10, v6
-; GISEL-NEXT: v_mul_hi_u32 v13, v10, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
-; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12
-; GISEL-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v7, v13
+; GISEL-NEXT: v_mul_lo_u32 v6, v13, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v13, v4
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v8, v1
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
+; GISEL-NEXT: v_mul_hi_u32 v5, v13, v5
+; GISEL-NEXT: v_add_i32_e32 v1, vcc, v4, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, v11, v1
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, v13, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v14, v8, 0
+; GISEL-NEXT: v_add_i32_e32 v13, vcc, 1, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v14, v11, v[1:2]
+; GISEL-NEXT: v_addc_u32_e32 v18, vcc, 0, v17, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v8, v[5:6]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v16, v13, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v17, v18, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v8, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v10, v1, vcc
+; GISEL-NEXT: v_add_i32_e64 v10, s[4:5], 0, v12
+; GISEL-NEXT: v_mul_hi_u32 v12, v8, v4
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v13, v11, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT: v_mul_hi_u32 v8, v10, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v6, v11, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v12
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v12, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v11, v4
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT: v_mul_hi_u32 v7, v8, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v12, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v8, v7
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v10, v5
-; GISEL-NEXT: v_addc_u32_e64 v6, s[4:5], v11, v6, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v5
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v6
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v0, v3, vcc
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v12, v5
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v7, v12, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v12, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v11, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v8, v4
+; GISEL-NEXT: v_addc_u32_e64 v5, s[4:5], v11, v5, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v6, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v10, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v9, vcc
+; GISEL-NEXT: v_mul_hi_u32 v0, v10, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v5, v0
-; GISEL-NEXT: v_mul_hi_u32 v10, v12, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v4, v8, 0
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v3, v5
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v10, v[0:1]
+; GISEL-NEXT: v_mul_hi_u32 v7, v10, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v0
+; GISEL-NEXT: v_mul_hi_u32 v9, v3, v5
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v6, v0
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, v[0:1]
; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v8, v[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v9, v5
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v12, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v12, v6
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v4
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v2
-; GISEL-NEXT: v_subbrev_u32_e32 v6, vcc, 0, v6, vcc
-; GISEL-NEXT: v_cndmask_b32_e64 v5, v7, v9, s[4:5]
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, 1, v8
-; GISEL-NEXT: v_addc_u32_e32 v9, vcc, 0, v10, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v6, v2
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v10, v4
+; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v4, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_subbrev_u32_e32 v5, vcc, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v7
+; GISEL-NEXT: v_addc_u32_e32 v10, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v5, v3
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v3, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v3, 0, -1, vcc
-; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v6, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v3, vcc
-; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v7
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v9, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e32 vcc, v4, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, v5, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v11, v2, vcc
+; GISEL-NEXT: v_add_i32_e32 v3, vcc, 1, v8
+; GISEL-NEXT: v_addc_u32_e32 v4, vcc, 0, v10, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v3, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v4, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v5
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v10, v4, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v7, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
@@ -3399,3 +2841,24 @@ define <2 x i64> @v_sdiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
%result = sdiv <2 x i64> %num.mask, %den.mask
ret <2 x i64> %result
}
+
+define i64 @v_sdiv_i64_exact(i64 %num) {
+; CHECK-LABEL: v_sdiv_i64_exact:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %result = sdiv exact i64 %num, 4096
+ ret i64 %result
+}
+
+define <2 x i64> @v_sdiv_v2i64_exact(<2 x i64> %num) {
+; CHECK-LABEL: v_sdiv_v2i64_exact:
+; CHECK: ; %bb.0:
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_ashr_i64 v[0:1], v[0:1], 12
+; CHECK-NEXT: v_ashr_i64 v[2:3], v[2:3], 10
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+ %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024>
+ ret <2 x i64> %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
index c455b24..83ebc84 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/srem.i64.ll
@@ -3034,253 +3034,251 @@ define <2 x i64> @v_srem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL: ; %bb.0:
; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GISEL-NEXT: v_and_b32_e32 v1, 0xffffff, v4
+; GISEL-NEXT: v_add_i32_e64 v3, s[4:5], 0, 0
; GISEL-NEXT: v_add_i32_e32 v1, vcc, 0, v1
-; GISEL-NEXT: v_addc_u32_e64 v3, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v1
-; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v3
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, 0, v1
-; GISEL-NEXT: v_subb_u32_e32 v11, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v4, 0x4f800000, v5
-; GISEL-NEXT: v_rcp_iflag_f32_e32 v4, v4
+; GISEL-NEXT: v_cvt_f32_u32_e32 v5, v1
+; GISEL-NEXT: v_cvt_f32_u32_e32 v4, v3
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, 0, v1
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, 0, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v5, 0x4f800000, v4
+; GISEL-NEXT: v_rcp_iflag_f32_e32 v5, v5
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v0
-; GISEL-NEXT: v_mul_f32_e32 v4, 0x5f7ffffc, v4
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v4
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v4, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v9, v4
-; GISEL-NEXT: v_cvt_u32_f32_e32 v12, v7
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_mul_hi_u32 v13, v9, v4
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v13
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v13, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v13, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v5
+; GISEL-NEXT: v_mul_f32_e32 v5, 0x5f7ffffc, v5
+; GISEL-NEXT: v_mul_f32_e32 v7, 0x2f800000, v5
+; GISEL-NEXT: v_trunc_f32_e32 v9, v7
+; GISEL-NEXT: v_mac_f32_e32 v5, 0xcf800000, v9
+; GISEL-NEXT: v_cvt_u32_f32_e32 v10, v5
+; GISEL-NEXT: v_cvt_u32_f32_e32 v13, v9
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_mul_hi_u32 v14, v10, v7
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v14
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_mul_lo_u32 v14, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v14, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v14, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v4
-; GISEL-NEXT: v_addc_u32_e32 v12, vcc, v12, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v10, v9, 0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v10, v12, v[5:6]
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, 0, v0
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v9, v[7:8]
-; GISEL-NEXT: v_mul_hi_u32 v0, v9, v4
-; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, 0, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v9, v7
-; GISEL-NEXT: v_mul_hi_u32 v4, v12, v4
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v5
+; GISEL-NEXT: v_addc_u32_e32 v13, vcc, v13, v7, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v11, v10, 0
+; GISEL-NEXT: v_mov_b32_e32 v5, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v11, v13, v[5:6]
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v11, vcc, 0, v0
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v12, v10, v[8:9]
+; GISEL-NEXT: v_mul_hi_u32 v0, v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v7, v13, v7
+; GISEL-NEXT: v_mul_lo_u32 v9, v10, v8
+; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
-; GISEL-NEXT: v_mul_hi_u32 v8, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v8
-; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
-; GISEL-NEXT: v_mul_hi_u32 v7, v12, v7
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
-; GISEL-NEXT: v_add_i32_e32 v4, vcc, v7, v4
+; GISEL-NEXT: v_mul_lo_u32 v5, v13, v8
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
-; GISEL-NEXT: v_addc_u32_e32 v4, vcc, v12, v4, vcc
-; GISEL-NEXT: v_mul_lo_u32 v5, v11, v0
-; GISEL-NEXT: v_mul_lo_u32 v7, v10, v4
-; GISEL-NEXT: v_mul_hi_u32 v8, v10, v0
-; GISEL-NEXT: v_mul_hi_u32 v0, v11, v0
-; GISEL-NEXT: v_and_b32_e32 v12, 0xffffff, v2
+; GISEL-NEXT: v_mul_hi_u32 v9, v10, v8
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v8
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v9
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
+; GISEL-NEXT: v_mul_hi_u32 v8, v13, v8
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v8, v11, v4
; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
-; GISEL-NEXT: v_mul_hi_u32 v7, v10, v4
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v8, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v8, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v10, v0
+; GISEL-NEXT: v_addc_u32_e32 v5, vcc, v13, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v7, v3, v0
+; GISEL-NEXT: v_mul_lo_u32 v8, v11, v5
+; GISEL-NEXT: v_mul_hi_u32 v9, v11, v0
+; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v9
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v9, v3, v5
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v7
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v0, v5
-; GISEL-NEXT: v_mul_hi_u32 v8, v11, v4
-; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v1, v9, 0
+; GISEL-NEXT: v_mul_hi_u32 v8, v11, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, v9, v8
+; GISEL-NEXT: v_add_i32_e32 v10, vcc, v0, v7
+; GISEL-NEXT: v_mul_hi_u32 v5, v3, v5
+; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v10, 0
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v7, v0
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v8, v0
-; GISEL-NEXT: v_mov_b32_e32 v0, v5
-; GISEL-NEXT: v_mad_u64_u32 v[7:8], s[4:5], v1, v7, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v1, v5, v[0:1]
; GISEL-NEXT: v_and_b32_e32 v0, 0xffffff, v6
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v9, v[7:8]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v10, v4
-; GISEL-NEXT: v_subb_u32_e64 v9, s[4:5], v11, v5, vcc
-; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_sub_i32_e32 v7, vcc, v11, v7
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v10, v[8:9]
+; GISEL-NEXT: v_subb_u32_e64 v8, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v3
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v1
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v7, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, -1, s[4:5]
; GISEL-NEXT: v_add_i32_e64 v2, s[4:5], 0, v0
-; GISEL-NEXT: v_addc_u32_e64 v4, s[4:5], 0, 0, s[4:5]
; GISEL-NEXT: v_cvt_f32_u32_e32 v0, v2
-; GISEL-NEXT: v_cvt_f32_u32_e32 v10, v4
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
-; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v5, v3, vcc
-; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v10
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v8, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v6, v9, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v5, v3, vcc
+; GISEL-NEXT: v_mac_f32_e32 v0, 0x4f800000, v4
; GISEL-NEXT: v_rcp_iflag_f32_e32 v0, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, v6, v7, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v10, vcc, v8, v1
+; GISEL-NEXT: v_sub_i32_e32 v11, vcc, v7, v1
+; GISEL-NEXT: v_subbrev_u32_e64 v13, s[4:5], 0, v10, vcc
; GISEL-NEXT: v_mul_f32_e32 v0, 0x5f7ffffc, v0
-; GISEL-NEXT: v_mul_f32_e32 v5, 0x2f800000, v0
-; GISEL-NEXT: v_trunc_f32_e32 v7, v5
-; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v7
-; GISEL-NEXT: v_cvt_u32_f32_e32 v15, v0
-; GISEL-NEXT: v_subbrev_u32_e64 v14, s[4:5], 0, v13, vcc
-; GISEL-NEXT: v_sub_i32_e64 v16, s[4:5], 0, v2
-; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, v4, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v15, 0
-; GISEL-NEXT: v_cvt_u32_f32_e32 v18, v7
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v16, v18, v[0:1]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v10, v1
+; GISEL-NEXT: v_mul_f32_e32 v4, 0x2f800000, v0
+; GISEL-NEXT: v_trunc_f32_e32 v6, v4
+; GISEL-NEXT: v_mac_f32_e32 v0, 0xcf800000, v6
+; GISEL-NEXT: v_cvt_u32_f32_e32 v14, v0
+; GISEL-NEXT: v_sub_i32_e64 v15, s[4:5], 0, v2
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, v3, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_cvt_u32_f32_e32 v17, v6
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, -1, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v11, v1
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[4:5]
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v17, v15, v[6:7]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v14, v3
-; GISEL-NEXT: v_cndmask_b32_e64 v7, v19, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v0, v18, v5
-; GISEL-NEXT: v_mul_lo_u32 v19, v15, v6
-; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v13, v3, vcc
-; GISEL-NEXT: v_mul_hi_u32 v13, v15, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v14, v[5:6]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v13, v3
+; GISEL-NEXT: v_cndmask_b32_e64 v6, v18, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v0, v17, v4
+; GISEL-NEXT: v_mul_lo_u32 v18, v14, v5
+; GISEL-NEXT: v_mul_hi_u32 v19, v14, v4
+; GISEL-NEXT: v_subb_u32_e32 v10, vcc, v10, v3, vcc
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v19
-; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v0, v13
; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v13, v18, v6
-; GISEL-NEXT: v_mul_hi_u32 v5, v18, v5
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v19, v0
-; GISEL-NEXT: v_mul_hi_u32 v19, v15, v6
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v19
+; GISEL-NEXT: v_mul_lo_u32 v19, v17, v5
+; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v18, v0
+; GISEL-NEXT: v_mul_hi_u32 v18, v14, v5
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v19, v4
; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v13, v19
-; GISEL-NEXT: v_mul_hi_u32 v6, v18, v6
-; GISEL-NEXT: v_add_i32_e32 v0, vcc, v5, v0
-; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v13, v5
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v6, v5
-; GISEL-NEXT: v_add_i32_e32 v13, vcc, v15, v0
-; GISEL-NEXT: v_addc_u32_e32 v15, vcc, v18, v5, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v16, v13, 0
-; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v10, v1
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v15, v[0:1]
-; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
-; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v17, v13, v[0:1]
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v6, v10, v18, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v14, v3, vcc
-; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11
-; GISEL-NEXT: v_cndmask_b32_e32 v1, v8, v6, vcc
-; GISEL-NEXT: v_mul_lo_u32 v6, v15, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v13, v0
-; GISEL-NEXT: v_mul_hi_u32 v11, v13, v5
-; GISEL-NEXT: v_add_i32_e64 v8, s[4:5], 0, v12
-; GISEL-NEXT: v_addc_u32_e64 v10, s[4:5], 0, 0, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v6, v11
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v11, v15, v0
-; GISEL-NEXT: v_mul_hi_u32 v5, v15, v5
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v13, v0
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v11, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, v19, v18
+; GISEL-NEXT: v_mul_hi_u32 v5, v17, v5
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, v4, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v18, v4
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v5, v4
+; GISEL-NEXT: v_add_i32_e32 v14, vcc, v14, v0
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v15, v14, 0
+; GISEL-NEXT: v_sub_i32_e32 v18, vcc, v11, v1
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v15, v17, v[0:1]
+; GISEL-NEXT: v_subbrev_u32_e32 v10, vcc, 0, v10, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v16, v14, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v11, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v13, v10, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v7, v5, vcc
+; GISEL-NEXT: v_mul_lo_u32 v5, v17, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v14, v0
+; GISEL-NEXT: v_mul_hi_u32 v10, v14, v4
+; GISEL-NEXT: v_add_i32_e64 v9, s[4:5], 0, v12
; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v11, v7
-; GISEL-NEXT: v_mul_hi_u32 v0, v15, v0
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[4:5]
-; GISEL-NEXT: v_add_i32_e64 v6, s[4:5], v7, v6
-; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v6
-; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v13, v5
-; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v15, v0, s[4:5]
-; GISEL-NEXT: v_mul_lo_u32 v6, v10, v5
-; GISEL-NEXT: v_mul_lo_u32 v7, v8, v0
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
-; GISEL-NEXT: v_mul_hi_u32 v9, v8, v5
-; GISEL-NEXT: v_mul_hi_u32 v5, v10, v5
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v5, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v10, v17, v0
+; GISEL-NEXT: v_mul_hi_u32 v4, v17, v4
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v14, v0
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v10, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v7
+; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v7, s[4:5], v10, v7
+; GISEL-NEXT: v_mul_hi_u32 v0, v17, v0
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v4, v5
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e64 v5, s[4:5], v7, v5
+; GISEL-NEXT: v_add_i32_e64 v0, s[4:5], v0, v5
+; GISEL-NEXT: v_add_i32_e64 v4, s[4:5], v14, v4
+; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v17, v0, s[4:5]
+; GISEL-NEXT: v_mul_lo_u32 v5, v3, v4
+; GISEL-NEXT: v_mul_lo_u32 v7, v9, v0
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v6, vcc
+; GISEL-NEXT: v_mul_hi_u32 v6, v9, v4
+; GISEL-NEXT: v_mul_hi_u32 v4, v3, v4
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v9
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e64 v5, 0, 1, vcc
+; GISEL-NEXT: v_mul_lo_u32 v6, v3, v0
+; GISEL-NEXT: v_add_i32_e32 v5, vcc, v7, v5
+; GISEL-NEXT: v_mul_hi_u32 v7, v9, v0
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v6, v4
; GISEL-NEXT: v_cndmask_b32_e64 v6, 0, 1, vcc
-; GISEL-NEXT: v_mul_lo_u32 v9, v10, v0
-; GISEL-NEXT: v_add_i32_e32 v6, vcc, v7, v6
-; GISEL-NEXT: v_mul_hi_u32 v7, v8, v0
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v9, v5
-; GISEL-NEXT: v_cndmask_b32_e64 v9, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v5, vcc, v5, v7
+; GISEL-NEXT: v_add_i32_e32 v4, vcc, v4, v7
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v9, v7
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v5, v6
-; GISEL-NEXT: v_mul_hi_u32 v0, v10, v0
-; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v9, 0
-; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v11
-; GISEL-NEXT: v_add_i32_e32 v7, vcc, v0, v7
-; GISEL-NEXT: v_mov_b32_e32 v0, v6
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v2, v7, v[0:1]
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v7
+; GISEL-NEXT: v_add_i32_e32 v7, vcc, v4, v5
+; GISEL-NEXT: v_mul_hi_u32 v0, v3, v0
+; GISEL-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v2, v7, 0
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v10
+; GISEL-NEXT: v_add_i32_e32 v6, vcc, v0, v6
+; GISEL-NEXT: v_mov_b32_e32 v0, v5
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v2, v6, v[0:1]
; GISEL-NEXT: v_subrev_i32_e32 v0, vcc, 0, v1
-; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[4:5], v4, v9, v[6:7]
-; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v3, vcc
-; GISEL-NEXT: v_sub_i32_e32 v3, vcc, v8, v5
-; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v10, v6, vcc
-; GISEL-NEXT: v_sub_i32_e64 v6, s[4:5], v10, v6
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v5, v4
+; GISEL-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v3, v7, v[5:6]
+; GISEL-NEXT: v_subbrev_u32_e32 v1, vcc, 0, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v9, v4
+; GISEL-NEXT: v_subb_u32_e64 v6, s[4:5], v3, v5, vcc
+; GISEL-NEXT: v_sub_i32_e64 v5, s[4:5], v3, v5
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v6, v3
; GISEL-NEXT: v_cndmask_b32_e64 v7, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v3, v2
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v4, v2
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v5, v4
-; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v6, v4, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v6, v3
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v5, v3, vcc
; GISEL-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[4:5]
-; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v3, v2
-; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v6, vcc
-; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v4
+; GISEL-NEXT: v_sub_i32_e32 v8, vcc, v4, v2
+; GISEL-NEXT: v_subbrev_u32_e64 v9, s[4:5], 0, v5, vcc
+; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v9, v3
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, -1, s[4:5]
; GISEL-NEXT: v_cmp_ge_u32_e64 s[4:5], v8, v2
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, -1, s[4:5]
-; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v4
-; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v6, v4, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], v9, v3
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v5, v3, vcc
; GISEL-NEXT: v_sub_i32_e32 v2, vcc, v8, v2
; GISEL-NEXT: v_cndmask_b32_e64 v10, v10, v11, s[4:5]
-; GISEL-NEXT: v_subbrev_u32_e32 v4, vcc, 0, v4, vcc
+; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
; GISEL-NEXT: v_cndmask_b32_e32 v2, v8, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v9, v3, vcc
; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v7
-; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
-; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v6, v3, vcc
; GISEL-NEXT: v_subrev_i32_e32 v2, vcc, 0, v2
; GISEL-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v3, vcc
; GISEL-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
index 61e1e67..320dfbb 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll
@@ -4142,11 +4142,11 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX9-NEXT: v_lshlrev_b64 v[2:3], 16, v[2:3]
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4162,7 +4162,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, vcc_lo, s4
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4179,7 +4179,7 @@ define i48 @v_ssubsat_i48(i48 %lhs, i48 %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, vcc_lo, s0
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4202,7 +4202,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX6-NEXT: s_ashr_i32 s2, s7, 31
; GFX6-NEXT: s_ashr_i32 s5, s7, 15
-; GFX6-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX6-NEXT: s_addk_i32 s2, 0x8000
; GFX6-NEXT: v_mov_b32_e32 v0, s5
; GFX6-NEXT: v_mov_b32_e32 v1, s2
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4227,7 +4227,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX8-NEXT: s_ashr_i32 s2, s7, 31
; GFX8-NEXT: s_ashr_i32 s5, s7, 15
-; GFX8-NEXT: s_add_u32 s2, s2, 0xffff8000
+; GFX8-NEXT: s_addk_i32 s2, 0x8000
; GFX8-NEXT: v_mov_b32_e32 v0, s5
; GFX8-NEXT: v_mov_b32_e32 v1, s2
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4250,7 +4250,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4274,7 +4274,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4293,7 +4293,7 @@ define amdgpu_ps i48 @s_ssubsat_i48(i48 inreg %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4351,11 +4351,11 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4371,7 +4371,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4388,7 +4388,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_sv(i48 inreg %lhs, i48 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4442,15 +4442,15 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX9-LABEL: ssubsat_i48_vs:
; GFX9: ; %bb.0:
; GFX9-NEXT: v_lshlrev_b64 v[0:1], 16, v[0:1]
-; GFX9-NEXT: s_lshl_b64 s[2:3], s[0:1], 16
-; GFX9-NEXT: v_mov_b32_e32 v3, s3
-; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s2, v0
+; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 16
+; GFX9-NEXT: v_mov_b32_e32 v3, s1
+; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], v[2:3], v[0:1]
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[2:3], 0
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4466,7 +4466,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4483,7 +4483,7 @@ define amdgpu_ps <2 x float> @ssubsat_i48_vs(i48 %lhs, i48 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: v_ashrrev_i64 v[0:1], 16, v[0:1]
@@ -4529,11 +4529,11 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v0, v2
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[2:3]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4546,7 +4546,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo
@@ -4560,7 +4560,7 @@ define i64 @v_ssubsat_i64(i64 %lhs, i64 %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v5
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v6
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v6
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1
; GFX11-NEXT: s_setpc_b64 s[30:31]
@@ -4578,7 +4578,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX6-NEXT: s_ashr_i32 s2, s5, 31
-; GFX6-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX6-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s2
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: v_mov_b32_e32 v2, s4
@@ -4599,7 +4599,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX8-NEXT: s_ashr_i32 s2, s5, 31
-; GFX8-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX8-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s2
; GFX8-NEXT: v_mov_b32_e32 v1, s3
; GFX8-NEXT: v_mov_b32_e32 v2, s4
@@ -4620,7 +4620,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0
; GFX9-NEXT: s_ashr_i32 s2, s5, 31
-; GFX9-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX9-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s2
; GFX9-NEXT: v_mov_b32_e32 v1, s3
; GFX9-NEXT: v_mov_b32_e32 v2, s4
@@ -4641,7 +4641,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX10-NEXT: v_mov_b32_e32 v1, s5
; GFX10-NEXT: s_ashr_i32 s2, s5, 31
-; GFX10-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX10-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX10-NEXT: s_xor_b32 s0, s1, s0
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4657,7 +4657,7 @@ define amdgpu_ps i64 @s_ssubsat_i64(i64 inreg %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[4:5], s[0:1]
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[2:3], 0
; GFX11-NEXT: s_ashr_i32 s2, s5, 31
-; GFX11-NEXT: s_add_u32 s3, s2, 0x80000000
+; GFX11-NEXT: s_add_i32 s3, s2, 0x80000000
; GFX11-NEXT: s_xor_b32 s0, s1, s0
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, s0
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s3, s0
@@ -4702,11 +4702,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_sub_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v1, vcc
-; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], 0, v[0:1]
+; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, s[0:1], v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], 0, v[0:1]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[2:3], s[0:1]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4718,7 +4718,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4731,7 +4731,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_sv(i64 inreg %lhs, i64 %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4774,11 +4774,11 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX9-NEXT: v_mov_b32_e32 v3, s1
; GFX9-NEXT: v_subrev_co_u32_e32 v2, vcc, s0, v0
; GFX9-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v3, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[2:3], v[2:3], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[2:3], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[0:1], 0
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v3
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, 0x80000000, v0
-; GFX9-NEXT: s_xor_b64 vcc, s[0:1], s[2:3]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[0:1], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
; GFX9-NEXT: ; return to shader part epilog
@@ -4790,7 +4790,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX10-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX10-NEXT: v_add_co_u32 v1, s1, 0x80000000, v4
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo
@@ -4803,7 +4803,7 @@ define amdgpu_ps <2 x float> @ssubsat_i64_vs(i64 %lhs, i64 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[0:1], 0
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3
; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1
; GFX11-NEXT: ; return to shader part epilog
@@ -4866,21 +4866,20 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX9-NEXT: v_sub_co_u32_e32 v8, vcc, v0, v4
; GFX9-NEXT: v_subb_co_u32_e32 v9, vcc, v1, v5, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[8:9], v[0:1]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[4:5]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[8:9], v[0:1]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[4:5]
; GFX9-NEXT: v_ashrrev_i32_e32 v0, 31, v9
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v1, vcc, v0, v1
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v1, 0x80000000, v0
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc
; GFX9-NEXT: v_sub_co_u32_e32 v4, vcc, v2, v6
; GFX9-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v7, vcc
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], v[4:5], v[2:3]
-; GFX9-NEXT: v_cmp_lt_i64_e64 s[6:7], 0, v[6:7]
+; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, v[4:5], v[2:3]
+; GFX9-NEXT: v_cmp_lt_i64_e64 s[4:5], 0, v[6:7]
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v5
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, 0x80000000, v2
-; GFX9-NEXT: s_xor_b64 vcc, s[6:7], s[4:5]
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
+; GFX9-NEXT: s_xor_b64 vcc, s[4:5], vcc
; GFX9-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
; GFX9-NEXT: s_setpc_b64 s[30:31]
@@ -4896,10 +4895,10 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[8:9], v[0:1]
; GFX10-NEXT: v_cmp_lt_i64_e64 s4, 0, v[4:5]
; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v11
-; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
-; GFX10-NEXT: v_add_co_u32 v1, s5, 0x80000000, v12
; GFX10-NEXT: v_cmp_lt_i64_e64 s5, v[10:11], v[2:3]
-; GFX10-NEXT: v_add_co_u32 v3, s7, 0x80000000, v4
+; GFX10-NEXT: v_cmp_lt_i64_e64 s6, 0, v[6:7]
+; GFX10-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX10-NEXT: s_xor_b32 vcc_lo, s4, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v0, v8, v12, vcc_lo
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo
@@ -4921,8 +4920,8 @@ define <2 x i64> @v_ssubsat_v2i64(<2 x i64> %lhs, <2 x i64> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v11
; GFX11-NEXT: v_cmp_lt_i64_e64 s1, v[10:11], v[2:3]
; GFX11-NEXT: v_cmp_lt_i64_e64 s2, 0, v[6:7]
-; GFX11-NEXT: v_add_co_u32 v1, null, 0x80000000, v12
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v4
+; GFX11-NEXT: v_add_nc_u32_e32 v1, 0x80000000, v12
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v4
; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo
; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v12 :: v_dual_cndmask_b32 v1, v9, v1
; GFX11-NEXT: s_xor_b32 vcc_lo, s2, s1
@@ -4942,7 +4941,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX6-NEXT: s_ashr_i32 s4, s9, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v2, s8
@@ -4957,7 +4956,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX6-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
; GFX6-NEXT: s_ashr_i32 s4, s1, 31
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v0, s4
; GFX6-NEXT: v_mov_b32_e32 v1, s5
; GFX6-NEXT: v_mov_b32_e32 v4, s0
@@ -4980,7 +4979,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX8-NEXT: s_ashr_i32 s4, s9, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v2, s8
@@ -4995,7 +4994,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX8-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
; GFX8-NEXT: s_ashr_i32 s4, s1, 31
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v0, s4
; GFX8-NEXT: v_mov_b32_e32 v1, s5
; GFX8-NEXT: v_mov_b32_e32 v4, s0
@@ -5018,7 +5017,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0
; GFX9-NEXT: s_ashr_i32 s4, s9, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v2, s8
@@ -5033,7 +5032,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1]
; GFX9-NEXT: v_cmp_gt_i64_e64 s[2:3], s[6:7], 0
; GFX9-NEXT: s_ashr_i32 s4, s1, 31
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v0, s4
; GFX9-NEXT: v_mov_b32_e32 v1, s5
; GFX9-NEXT: v_mov_b32_e32 v4, s0
@@ -5056,7 +5055,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
; GFX10-NEXT: s_ashr_i32 s4, s9, 31
; GFX10-NEXT: v_mov_b32_e32 v1, s9
-; GFX10-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s8, s1, s0
; GFX10-NEXT: s_sub_u32 s0, s2, s6
; GFX10-NEXT: s_subb_u32 s1, s3, s7
@@ -5067,7 +5066,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX10-NEXT: s_ashr_i32 s4, s1, 31
; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: s_xor_b32 s1, s3, s2
; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5085,7 +5084,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[8:9], s[0:1]
; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[4:5], 0
; GFX11-NEXT: s_ashr_i32 s4, s9, 31
-; GFX11-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s8, s1, s0
; GFX11-NEXT: s_sub_u32 s0, s2, s6
; GFX11-NEXT: s_subb_u32 s1, s3, s7
@@ -5095,7 +5094,7 @@ define amdgpu_ps <2 x i64> @s_ssubsat_v2i64(<2 x i64> inreg %lhs, <2 x i64> inre
; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s4, s8
; GFX11-NEXT: s_ashr_i32 s4, s1, 31
; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s5, s8
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: s_xor_b32 s1, s3, s2
; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s4, s1
; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s0, s1
@@ -5134,7 +5133,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s11, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s8
; GFX6-NEXT: v_mov_b32_e32 v3, s9
@@ -5183,7 +5182,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s11, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s8
; GFX8-NEXT: v_mov_b32_e32 v3, s9
@@ -5232,7 +5231,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s11, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s8
; GFX9-NEXT: v_mov_b32_e32 v3, s9
@@ -5274,7 +5273,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX10-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX10-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_mov_b32_e32 v2, s9
; GFX10-NEXT: v_mov_b32_e32 v3, s11
@@ -5317,7 +5316,7 @@ define amdgpu_ps i128 @s_ssubsat_i128(i128 inreg %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
-; GFX11-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX11-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9
; GFX11-NEXT: v_mov_b32_e32 v3, s11
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
@@ -5427,9 +5426,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX9-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v0, v8
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5456,7 +5454,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_xor_b32_e32 v0, v0, v8
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5484,8 +5482,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_sv(i128 inreg %lhs, i128 %rhs) {
; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[2:3]
; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v1, v0 :: v_dual_add_nc_u32 v3, 0x80000000, v2
; GFX11-NEXT: v_xor_b32_e32 v0, v0, v8
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
@@ -5594,9 +5591,8 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v7
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v2, vcc
@@ -5625,7 +5621,7 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
-; GFX10-NEXT: v_add_co_u32 v3, s0, 0x80000000, v2
+; GFX10-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX10-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX10-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5652,12 +5648,12 @@ define amdgpu_ps <4 x float> @ssubsat_i128_vs(i128 %lhs, i128 inreg %rhs) {
; GFX11-NEXT: v_cmp_gt_i64_e64 s0, s[2:3], 0
; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[2:3]
-; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX11-NEXT: v_cndmask_b32_e64 v9, 0, 1, s0
; GFX11-NEXT: s_and_b32 s0, 1, s4
-; GFX11-NEXT: v_add_co_u32 v3, null, 0x80000000, v2
+; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v7
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0
+; GFX11-NEXT: v_add_nc_u32_e32 v3, 0x80000000, v2
; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc_lo
; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
@@ -5805,9 +5801,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: v_ashrrev_i32_e32 v2, 31, v19
-; GFX9-NEXT: v_bfrev_b32_e32 v1, 1
-; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v2, v1
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_add_u32_e32 v3, 0x80000000, v2
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc
@@ -5831,8 +5826,8 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX9-NEXT: v_cndmask_b32_e32 v5, v6, v5, vcc
; GFX9-NEXT: v_xor_b32_e32 v4, v5, v4
; GFX9-NEXT: v_ashrrev_i32_e32 v6, 31, v11
-; GFX9-NEXT: v_add_co_u32_e32 v7, vcc, 0x80000000, v6
; GFX9-NEXT: v_and_b32_e32 v4, 1, v4
+; GFX9-NEXT: v_add_u32_e32 v7, 0x80000000, v6
; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v6, vcc
; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v6, vcc
@@ -5877,18 +5872,18 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX10-NEXT: v_ashrrev_i32_e32 v6, 31, v21
; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX10-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX10-NEXT: v_ashrrev_i32_e32 v3, 31, v19
-; GFX10-NEXT: v_add_co_u32 v7, s5, 0x80000000, v6
+; GFX10-NEXT: v_add_nc_u32_e32 v7, 0x80000000, v6
; GFX10-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
-; GFX10-NEXT: v_add_co_u32 v4, s4, 0x80000000, v3
; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX10-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v3, vcc_lo
-; GFX10-NEXT: v_and_b32_e32 v5, 1, v1
-; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
+; GFX10-NEXT: v_ashrrev_i32_e32 v2, 31, v19
+; GFX10-NEXT: v_and_b32_e32 v3, 1, v1
+; GFX10-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
+; GFX10-NEXT: v_cndmask_b32_e32 v0, v16, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX10-NEXT: v_cndmask_b32_e32 v2, v18, v2, vcc_lo
+; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v3
; GFX10-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX10-NEXT: v_cmp_ne_u32_e64 s4, 0, v5
; GFX10-NEXT: v_cndmask_b32_e64 v4, v8, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v5, v9, v6, s4
; GFX10-NEXT: v_cndmask_b32_e64 v6, v20, v6, s4
@@ -5931,18 +5926,16 @@ define <2 x i128> @v_ssubsat_v2i128(<2 x i128> %lhs, <2 x i128> %rhs) {
; GFX11-NEXT: v_ashrrev_i32_e32 v6, 31, v21
; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo
; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, 0, v[14:15]
-; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v19
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v5, v4 :: v_dual_add_nc_u32 v7, 0x80000000, v6
+; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
+; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v19
; GFX11-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX11-NEXT: v_add_co_u32 v7, null, 0x80000000, v6
-; GFX11-NEXT: v_cndmask_b32_e32 v2, v5, v4, vcc_lo
+; GFX11-NEXT: v_add_nc_u32_e32 v4, 0x80000000, v2
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT: v_add_co_u32 v4, null, 0x80000000, v3
-; GFX11-NEXT: v_xor_b32_e32 v1, v2, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, v16, v3, vcc_lo
-; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v3 :: v_dual_and_b32 v5, 1, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v3, vcc_lo
-; GFX11-NEXT: v_cndmask_b32_e32 v3, v19, v4, vcc_lo
-; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v5
+; GFX11-NEXT: v_dual_cndmask_b32 v0, v16, v2 :: v_dual_and_b32 v3, 1, v1
+; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v3
+; GFX11-NEXT: v_cndmask_b32_e32 v1, v17, v2, vcc_lo
+; GFX11-NEXT: v_dual_cndmask_b32 v2, v18, v2 :: v_dual_cndmask_b32 v3, v19, v4
; GFX11-NEXT: v_cndmask_b32_e64 v4, v8, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v5, v9, v6, s0
; GFX11-NEXT: v_cndmask_b32_e64 v6, v20, v6, s0
@@ -5978,7 +5971,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s0, s19, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX6-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s0
; GFX6-NEXT: v_mov_b32_e32 v2, s16
; GFX6-NEXT: v_mov_b32_e32 v3, s17
@@ -6013,7 +6006,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX6-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX6-NEXT: s_ashr_i32 s4, s3, 31
; GFX6-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX6-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX6-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX6-NEXT: v_mov_b32_e32 v1, s4
; GFX6-NEXT: v_mov_b32_e32 v2, s0
; GFX6-NEXT: v_mov_b32_e32 v3, s1
@@ -6066,7 +6059,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s0, s19, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX8-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s0
; GFX8-NEXT: v_mov_b32_e32 v2, s16
; GFX8-NEXT: v_mov_b32_e32 v3, s17
@@ -6107,7 +6100,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX8-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX8-NEXT: s_ashr_i32 s4, s3, 31
; GFX8-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX8-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX8-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX8-NEXT: v_mov_b32_e32 v1, s4
; GFX8-NEXT: v_mov_b32_e32 v2, s0
; GFX8-NEXT: v_mov_b32_e32 v3, s1
@@ -6160,7 +6153,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s0, s19, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s1, s0, 0x80000000
+; GFX9-NEXT: s_add_i32 s1, s0, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s0
; GFX9-NEXT: v_mov_b32_e32 v2, s16
; GFX9-NEXT: v_mov_b32_e32 v3, s17
@@ -6201,7 +6194,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX9-NEXT: v_xor_b32_e32 v0, v1, v0
; GFX9-NEXT: s_ashr_i32 s4, s3, 31
; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
-; GFX9-NEXT: s_add_u32 s5, s4, 0x80000000
+; GFX9-NEXT: s_add_i32 s5, s4, 0x80000000
; GFX9-NEXT: v_mov_b32_e32 v1, s4
; GFX9-NEXT: v_mov_b32_e32 v2, s0
; GFX9-NEXT: v_mov_b32_e32 v3, s1
@@ -6244,7 +6237,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s1, 1, 0
; GFX10-NEXT: s_ashr_i32 s8, s17, 31
; GFX10-NEXT: s_and_b32 s1, 1, s1
-; GFX10-NEXT: s_add_u32 s9, s8, 0x80000000
+; GFX10-NEXT: s_add_i32 s9, s8, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6273,7 +6266,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX10-NEXT: s_cselect_b32 s5, 1, 0
; GFX10-NEXT: s_ashr_i32 s4, s3, 31
; GFX10-NEXT: s_and_b32 s5, 1, s5
-; GFX10-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX10-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
@@ -6326,7 +6319,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: s_cselect_b32 s1, 1, 0
; GFX11-NEXT: s_ashr_i32 s8, s19, 31
; GFX11-NEXT: s_and_b32 s1, 1, s1
-; GFX11-NEXT: s_add_u32 s9, s8, 0x80000000
+; GFX11-NEXT: s_add_i32 s9, s8, 0x80000000
; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2
; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1
@@ -6357,7 +6350,7 @@ define amdgpu_ps <2 x i128> @s_ssubsat_v2i128(<2 x i128> inreg %lhs, <2 x i128>
; GFX11-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo
; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s6
; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5
-; GFX11-NEXT: s_add_u32 s0, s4, 0x80000000
+; GFX11-NEXT: s_add_i32 s0, s4, 0x80000000
; GFX11-NEXT: v_dual_cndmask_b32 v2, v4, v3 :: v_dual_mov_b32 v3, s16
; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0
; GFX11-NEXT: v_mov_b32_e32 v0, s18
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
index 887c43f..d155513 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udiv.i64.ll
@@ -2062,13 +2062,9 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
@@ -2077,10 +2073,6 @@ define <2 x i64> @v_udiv_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
index 5c6bb6d..07480a0 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/urem.i64.ll
@@ -2480,13 +2480,9 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_mul_hi_u32 v17, v2, v5
; GISEL-NEXT: v_mul_hi_u32 v5, 0, v5
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v12
-; GISEL-NEXT: v_cndmask_b32_e64 v12, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v13, v6
-; GISEL-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v10, vcc, v10, v15
-; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v16, v7
-; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v8, vcc, v8, v9
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v14
@@ -2495,10 +2491,6 @@ define <2 x i64> @v_urem_v2i64_24bit(<2 x i64> %num, <2 x i64> %den) {
; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v17
; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
-; GISEL-NEXT: v_add_i32_e32 v8, vcc, v12, v8
-; GISEL-NEXT: v_add_i32_e32 v9, vcc, v13, v9
-; GISEL-NEXT: v_add_i32_e32 v10, vcc, v15, v10
-; GISEL-NEXT: v_add_i32_e32 v11, vcc, v16, v11
; GISEL-NEXT: v_add_i32_e32 v6, vcc, v6, v8
; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
; GISEL-NEXT: v_add_i32_e32 v7, vcc, v7, v10
diff --git a/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir b/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir
new file mode 100644
index 0000000..cba114c
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/add_sub_u64_pseudos.mir
@@ -0,0 +1,68 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=finalize-isel -o - %s | FileCheck -check-prefix=GFX11 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -run-pass=finalize-isel -o - %s | FileCheck -check-prefix=GFX12 %s
+
+---
+name: reg_ops
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: reg_ops
+ ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sreg_32 = COPY [[DEF1]].sub0
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sreg_32 = COPY [[DEF1]].sub1
+ ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], [[COPY2]], implicit-def $scc
+ ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY1]], [[COPY3]], implicit-def $scc, implicit $scc
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ;
+ ; GFX12-LABEL: name: reg_ops
+ ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[DEF1:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 [[DEF]], [[DEF1]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:sreg_64 = IMPLICIT_DEF
+ %2:sreg_64 = S_ADD_U64_PSEUDO %0, %1, implicit-def $scc
+...
+
+---
+name: lhs_imm
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: lhs_imm
+ ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1
+ ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 6565, [[COPY]], implicit-def $scc
+ ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 0, [[COPY1]], implicit-def $scc, implicit $scc
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ;
+ ; GFX12-LABEL: name: lhs_imm
+ ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 6565, [[DEF]]
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:sreg_64 = S_ADD_U64_PSEUDO 6565, %0, implicit-def $scc
+...
+
+---
+name: rhs_imm
+tracksRegLiveness: true
+body: |
+ bb.0:
+ ; GFX11-LABEL: name: rhs_imm
+ ; GFX11: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sreg_32 = COPY [[DEF]].sub1
+ ; GFX11-NEXT: [[S_ADD_U32_:%[0-9]+]]:sreg_32 = S_ADD_U32 [[COPY]], 6565, implicit-def $scc
+ ; GFX11-NEXT: [[S_ADDC_U32_:%[0-9]+]]:sreg_32 = S_ADDC_U32 [[COPY1]], 0, implicit-def $scc, implicit $scc
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sreg_64 = REG_SEQUENCE [[S_ADD_U32_]], %subreg.sub0, [[S_ADDC_U32_]], %subreg.sub1
+ ;
+ ; GFX12-LABEL: name: rhs_imm
+ ; GFX12: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
+ ; GFX12-NEXT: [[S_ADD_U64_:%[0-9]+]]:sreg_64 = S_ADD_U64 [[DEF]], 6565
+ %0:sreg_64 = IMPLICIT_DEF
+ %1:sreg_64 = S_ADD_U64_PSEUDO %0, 6565, implicit-def $scc
+...
diff --git a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
index 66034af..cff9ce0 100644
--- a/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/addrspacecast-constantexpr.ll
@@ -233,9 +233,9 @@ attributes #1 = { nounwind }
; AKF_HSA: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/allow-check.ll b/llvm/test/CodeGen/AMDGPU/allow-check.ll
new file mode 100644
index 0000000..d4f5621
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/allow-check.ll
@@ -0,0 +1,30 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=0 | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=1 -fast-isel=0 | FileCheck %s
+; RUN: llc < %s -mtriple=amdgcn-amd-mesa3d -global-isel=0 -fast-isel=1 | FileCheck %s
+
+define i1 @test_runtime() local_unnamed_addr {
+; CHECK-LABEL: test_runtime:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %allow = call i1 @llvm.allow.runtime.check(metadata !"test_check")
+ ret i1 %allow
+}
+
+declare i1 @llvm.allow.runtime.check(metadata) nounwind
+
+define i1 @test_ubsan() local_unnamed_addr {
+; CHECK-LABEL: test_ubsan:
+; CHECK: ; %bb.0: ; %entry
+; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CHECK-NEXT: v_mov_b32_e32 v0, 1
+; CHECK-NEXT: s_setpc_b64 s[30:31]
+entry:
+ %allow = call i1 @llvm.allow.ubsan.check(i8 7)
+ ret i1 %allow
+}
+
+declare i1 @llvm.allow.ubsan.check(i8) nounwind
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
new file mode 100644
index 0000000..33b1cc6
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-attributor-no-agpr.ll
@@ -0,0 +1,255 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-attributes --check-globals all --version 4
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx90a -passes=amdgpu-attributor %s | FileCheck %s
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg(
+; CHECK-SAME: ) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_def() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_def(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call i32 asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call i32 asm sideeffect "; def $0", "=a"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_def_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: [[DEF:%.*]] = call i64 asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ %def = call i64 asm sideeffect "; def $0", "={a[0:1]}"()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_virtreg_second_arg(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "v,a"(i32 poison, i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_non_agpr_asm() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_non_agpr_asm(
+; CHECK-SAME: ) #[[ATTR1:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "v"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+ ret void
+}
+
+define amdgpu_kernel void @kernel_uses_asm_physreg_tuple() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_uses_asm_physreg_tuple(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+ ret void
+}
+
+define void @func_uses_asm_virtreg_agpr() {
+; CHECK-LABEL: define void @func_uses_asm_virtreg_agpr(
+; CHECK-SAME: ) #[[ATTR2:[0-9]+]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "a"(i32 poison)
+ ret void
+}
+
+define void @func_uses_asm_physreg_agpr() {
+; CHECK-LABEL: define void @func_uses_asm_physreg_agpr(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a0}"(i32 poison)
+ ret void
+}
+
+define void @func_uses_asm_physreg_agpr_tuple() {
+; CHECK-LABEL: define void @func_uses_asm_physreg_agpr_tuple(
+; CHECK-SAME: ) #[[ATTR2]] {
+; CHECK-NEXT: call void asm sideeffect "
+; CHECK-NEXT: ret void
+;
+ call void asm sideeffect "; use $0", "{a[0:1]}"(i64 poison)
+ ret void
+}
+
+declare void @unknown()
+
+define amdgpu_kernel void @kernel_calls_extern() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern(
+; CHECK-SAME: ) #[[ATTR4:[0-9]+]] {
+; CHECK-NEXT: call void @unknown()
+; CHECK-NEXT: ret void
+;
+ call void @unknown()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_extern_marked_callsite() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_extern_marked_callsite(
+; CHECK-SAME: ) #[[ATTR4]] {
+; CHECK-NEXT: call void @unknown() #[[ATTR9:[0-9]+]]
+; CHECK-NEXT: ret void
+;
+ call void @unknown() #0
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_indirect(ptr %indirect) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect(
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: call void [[INDIRECT]]()
+; CHECK-NEXT: ret void
+;
+ call void %indirect()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(ptr %indirect) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_indirect_marked_callsite(
+; CHECK-SAME: ptr [[INDIRECT:%.*]]) #[[ATTR4]] {
+; CHECK-NEXT: call void [[INDIRECT]]() #[[ATTR9]]
+; CHECK-NEXT: ret void
+;
+ call void %indirect() #0
+ ret void
+}
+
+define amdgpu_kernel void @kernel_transitively_uses_agpr_asm() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_transitively_uses_agpr_asm(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT: ret void
+;
+ call void @func_uses_asm_physreg_agpr()
+ ret void
+}
+
+define void @empty() {
+; CHECK-LABEL: define void @empty(
+; CHECK-SAME: ) #[[ATTR5:[0-9]+]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define void @also_empty() {
+; CHECK-LABEL: define void @also_empty(
+; CHECK-SAME: ) #[[ATTR5]] {
+; CHECK-NEXT: ret void
+;
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_empty() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_empty(
+; CHECK-SAME: ) #[[ATTR1]] {
+; CHECK-NEXT: call void @empty()
+; CHECK-NEXT: ret void
+;
+ call void @empty()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr() {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_non_agpr_and_agpr(
+; CHECK-SAME: ) #[[ATTR0]] {
+; CHECK-NEXT: call void @empty()
+; CHECK-NEXT: call void @func_uses_asm_physreg_agpr()
+; CHECK-NEXT: ret void
+;
+ call void @empty()
+ call void @func_uses_asm_physreg_agpr()
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_generic_intrinsic(ptr %ptr0, ptr %ptr1, i64 %size) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_generic_intrinsic(
+; CHECK-SAME: ptr [[PTR0:%.*]], ptr [[PTR1:%.*]], i64 [[SIZE:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr [[PTR0]], ptr [[PTR1]], i64 [[SIZE]], i1 false)
+; CHECK-NEXT: ret void
+;
+ call void @llvm.memcpy.p0.p0.i64(ptr %ptr0, ptr %ptr1, i64 %size, i1 false)
+ ret void
+}
+
+declare <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float, float, <32 x float>, i32 immarg, i32 immarg, i32 immarg)
+
+define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(ptr addrspace(1) %out, float %a, float %b, <32 x float> %c) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_mfma.f32.32x32x1f32(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]], float [[A:%.*]], float [[B:%.*]], <32 x float> [[C:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[RESULT:%.*]] = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float [[A]], float [[B]], <32 x float> [[C]], i32 0, i32 0, i32 0)
+; CHECK-NEXT: store <32 x float> [[RESULT]], ptr addrspace(1) [[OUT]], align 128
+; CHECK-NEXT: ret void
+;
+ %result = call <32 x float> @llvm.amdgcn.mfma.f32.32x32x1f32(float %a, float %b, <32 x float> %c, i32 0, i32 0, i32 0)
+ store <32 x float> %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @kernel_calls_workitem_id_x(ptr addrspace(1) %out) {
+; CHECK-LABEL: define amdgpu_kernel void @kernel_calls_workitem_id_x(
+; CHECK-SAME: ptr addrspace(1) [[OUT:%.*]]) #[[ATTR1]] {
+; CHECK-NEXT: [[RESULT:%.*]] = call i32 @llvm.amdgcn.workitem.id.x()
+; CHECK-NEXT: store i32 [[RESULT]], ptr addrspace(1) [[OUT]], align 4
+; CHECK-NEXT: ret void
+;
+ %result = call i32 @llvm.amdgcn.workitem.id.x()
+ store i32 %result, ptr addrspace(1) %out
+ ret void
+}
+
+define amdgpu_kernel void @indirect_calls_none_agpr(i1 %cond) {
+; CHECK-LABEL: define amdgpu_kernel void @indirect_calls_none_agpr(
+; CHECK-SAME: i1 [[COND:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[FPTR:%.*]] = select i1 [[COND]], ptr @empty, ptr @also_empty
+; CHECK-NEXT: call void [[FPTR]]()
+; CHECK-NEXT: ret void
+;
+ %fptr = select i1 %cond, ptr @empty, ptr @also_empty
+ call void %fptr()
+ ret void
+}
+
+
+attributes #0 = { "amdgpu-no-agpr" }
+;.
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3:[0-9]+]] = { "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,8" "target-cpu"="gfx90a" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6:[0-9]+]] = { convergent nocallback nofree nosync nounwind willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR7:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR8:[0-9]+]] = { nocallback nofree nounwind willreturn memory(argmem: readwrite) "target-cpu"="gfx90a" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" }
+;.
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
index 192bf7c..93b9aea 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-break-large-phis.ll
@@ -1197,3 +1197,54 @@ reallyfinally:
store <5 x double> %val, ptr %out, align 1
ret void
}
+
+define amdgpu_kernel void @pr85718(i1 %Bool, ptr %Ptr, <4 x float> %Vec1, <4 x float> %Vec2) {
+; OPT-LABEL: @pr85718(
+; OPT-NEXT: BB0:
+; OPT-NEXT: [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true
+; OPT-NEXT: br label [[BB1:%.*]]
+; OPT: BB1:
+; OPT-NEXT: [[TMP0:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE0:%.*]], [[BB2:%.*]] ], [ [[LARGEPHI_EXTRACTSLICE1:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0:%.*]] ]
+; OPT-NEXT: [[TMP1:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE3:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE4:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT: [[TMP2:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE6:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE7:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT: [[TMP3:%.*]] = phi float [ [[LARGEPHI_EXTRACTSLICE9:%.*]], [[BB2]] ], [ [[LARGEPHI_EXTRACTSLICE10:%.*]], [[BB1]] ], [ 0.000000e+00, [[BB0]] ]
+; OPT-NEXT: [[LARGEPHI_INSERTSLICE0:%.*]] = insertelement <4 x float> poison, float [[TMP0]], i64 0
+; OPT-NEXT: [[LARGEPHI_INSERTSLICE1:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE0]], float [[TMP1]], i64 1
+; OPT-NEXT: [[LARGEPHI_INSERTSLICE2:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE1]], float [[TMP2]], i64 2
+; OPT-NEXT: [[LARGEPHI_INSERTSLICE3:%.*]] = insertelement <4 x float> [[LARGEPHI_INSERTSLICE2]], float [[TMP3]], i64 3
+; OPT-NEXT: store <4 x float> [[LARGEPHI_INSERTSLICE3]], ptr [[PTR:%.*]], align 128
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE1]] = extractelement <4 x float> [[VEC2:%.*]], i64 0
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE4]] = extractelement <4 x float> [[VEC2]], i64 1
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE7]] = extractelement <4 x float> [[VEC2]], i64 2
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE10]] = extractelement <4 x float> [[VEC2]], i64 3
+; OPT-NEXT: br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]]
+; OPT: BB2:
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE0]] = extractelement <4 x float> [[I]], i64 0
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE3]] = extractelement <4 x float> [[I]], i64 1
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE6]] = extractelement <4 x float> [[I]], i64 2
+; OPT-NEXT: [[LARGEPHI_EXTRACTSLICE9]] = extractelement <4 x float> [[I]], i64 3
+; OPT-NEXT: br label [[BB1]]
+;
+; NOOPT-LABEL: @pr85718(
+; NOOPT-NEXT: BB0:
+; NOOPT-NEXT: [[I:%.*]] = insertelement <4 x float> [[VEC1:%.*]], float 4.200000e+01, i1 true
+; NOOPT-NEXT: br label [[BB1:%.*]]
+; NOOPT: BB1:
+; NOOPT-NEXT: [[PHI:%.*]] = phi <4 x float> [ [[I]], [[BB2:%.*]] ], [ [[VEC2:%.*]], [[BB1]] ], [ zeroinitializer, [[BB0:%.*]] ]
+; NOOPT-NEXT: store <4 x float> [[PHI]], ptr [[PTR:%.*]], align 128
+; NOOPT-NEXT: br i1 [[BOOL:%.*]], label [[BB1]], label [[BB2]]
+; NOOPT: BB2:
+; NOOPT-NEXT: br label [[BB1]]
+;
+BB0:
+ %I = insertelement <4 x float> %Vec1, float 4.200000e+01, i1 true
+ br label %BB1
+
+BB1: ; preds = %BB0, %BB1, %BB2
+ %PHI = phi <4 x float> [ %I, %BB2 ], [ %Vec2, %BB1 ], [ zeroinitializer, %BB0 ]
+ store <4 x float> %PHI, ptr %Ptr, align 128
+ br i1 %Bool, label %BB1, label %BB2
+
+BB2: ; preds = %BB1
+ br label %BB1
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
index d900165..2ad28b8 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-codegenprepare-idiv.ll
@@ -10668,3 +10668,111 @@ define amdgpu_kernel void @srem_v2i64_pow2_shl_denom(ptr addrspace(1) %out, <2 x
store <2 x i64> %r, ptr addrspace(1) %out
ret void
}
+
+define <2 x i32> @v_sdiv_i32_exact(<2 x i32> %num) {
+; CHECK-LABEL: @v_sdiv_i32_exact(
+; CHECK: %1 = extractelement <2 x i32> %num, i64 0
+; CHECK-NEXT: %2 = sdiv exact i32 %1, 4096
+; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0
+; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1
+; CHECK-NEXT: %5 = sdiv exact i32 %4, 1024
+; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1
+; CHECK-NEXT: ret <2 x i32> %6
+;
+; GFX6-LABEL: v_sdiv_i32_exact:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; GFX6-NEXT: v_ashrrev_i32_e32 v1, 10, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sdiv_i32_exact:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i32_e32 v0, 12, v0
+; GFX9-NEXT: v_ashrrev_i32_e32 v1, 10, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %result = sdiv exact <2 x i32> %num, <i32 4096, i32 1024>
+ ret <2 x i32> %result
+}
+
+define <2 x i64> @v_sdiv_i64_exact(<2 x i64> %num) {
+; CHECK-LABEL: @v_sdiv_i64_exact(
+; CHECK: %1 = extractelement <2 x i64> %num, i64 0
+; CHECK-NEXT: %2 = sdiv exact i64 %1, 4096
+; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0
+; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1
+; CHECK-NEXT: %5 = sdiv exact i64 %4, 1024
+; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1
+; CHECK-NEXT: ret <2 x i64> %6
+;
+; GFX6-LABEL: v_sdiv_i64_exact:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_ashr_i64 v[0:1], v[0:1], 12
+; GFX6-NEXT: v_ashr_i64 v[2:3], v[2:3], 10
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_sdiv_i64_exact:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_ashrrev_i64 v[0:1], 12, v[0:1]
+; GFX9-NEXT: v_ashrrev_i64 v[2:3], 10, v[2:3]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %result = sdiv exact <2 x i64> %num, <i64 4096, i64 1024>
+ ret <2 x i64> %result
+}
+
+define <2 x i32> @v_udiv_i32_exact(<2 x i32> %num) {
+; CHECK-LABEL: @v_udiv_i32_exact(
+; CHECK: %1 = extractelement <2 x i32> %num, i64 0
+; CHECK-NEXT: %2 = udiv exact i32 %1, 4096
+; CHECK-NEXT: %3 = insertelement <2 x i32> poison, i32 %2, i64 0
+; CHECK-NEXT: %4 = extractelement <2 x i32> %num, i64 1
+; CHECK-NEXT: %5 = udiv exact i32 %4, 1024
+; CHECK-NEXT: %6 = insertelement <2 x i32> %3, i32 %5, i64 1
+; CHECK-NEXT: ret <2 x i32> %6
+;
+; GFX6-LABEL: v_udiv_i32_exact:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshrrev_b32_e32 v0, 12, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 10, v1
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_udiv_i32_exact:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b32_e32 v0, 12, v0
+; GFX9-NEXT: v_lshrrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %result = udiv exact <2 x i32> %num, <i32 4096, i32 1024>
+ ret <2 x i32> %result
+}
+
+define <2 x i64> @v_udiv_i64_exact(<2 x i64> %num) {
+; CHECK-LABEL: @v_udiv_i64_exact(
+; CHECK: %1 = extractelement <2 x i64> %num, i64 0
+; CHECK-NEXT: %2 = udiv exact i64 %1, 4096
+; CHECK-NEXT: %3 = insertelement <2 x i64> poison, i64 %2, i64 0
+; CHECK-NEXT: %4 = extractelement <2 x i64> %num, i64 1
+; CHECK-NEXT: %5 = udiv exact i64 %4, 1024
+; CHECK-NEXT: %6 = insertelement <2 x i64> %3, i64 %5, i64 1
+; CHECK-NEXT: ret <2 x i64> %6
+;
+; GFX6-LABEL: v_udiv_i64_exact:
+; GFX6: ; %bb.0:
+; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX6-NEXT: v_lshr_b64 v[0:1], v[0:1], 12
+; GFX6-NEXT: v_lshr_b64 v[2:3], v[2:3], 10
+; GFX6-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: v_udiv_i64_exact:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_lshrrev_b64 v[0:1], 12, v[0:1]
+; GFX9-NEXT: v_lshrrev_b64 v[2:3], 10, v[2:3]
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+ %result = udiv exact <2 x i64> %num, <i64 4096, i64 1024>
+ ret <2 x i64> %result
+}
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
index 942f459..8ddaf24 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-pown.ll
@@ -808,7 +808,7 @@ define float @test_pown_fast_f32_nobuiltin(float %x, i32 %y) {
; CHECK-LABEL: define float @test_pown_fast_f32_nobuiltin
; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR3:[0-9]+]]
+; CHECK-NEXT: [[CALL:%.*]] = tail call fast float @_Z4pownfi(float [[X]], i32 [[Y]]) #[[ATTR4:[0-9]+]]
; CHECK-NEXT: ret float [[CALL]]
;
entry:
@@ -820,11 +820,11 @@ define float @test_pown_fast_f32_strictfp(float %x, i32 %y) #1 {
; CHECK-LABEL: define float @test_pown_fast_f32_strictfp
; CHECK-SAME: (float [[X:%.*]], i32 [[Y:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]])
-; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]])
-; CHECK-NEXT: [[POWNI2F:%.*]] = sitofp i32 [[Y]] to float
-; CHECK-NEXT: [[__YLOGX:%.*]] = fmul fast float [[__LOG2]], [[POWNI2F]]
-; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]])
+; CHECK-NEXT: [[__FABS:%.*]] = call fast float @llvm.fabs.f32(float [[X]]) #[[ATTR0]]
+; CHECK-NEXT: [[__LOG2:%.*]] = call fast float @llvm.log2.f32(float [[__FABS]]) #[[ATTR0]]
+; CHECK-NEXT: [[POWNI2F:%.*]] = call fast float @llvm.experimental.constrained.sitofp.f32.i32(i32 [[Y]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]]
+; CHECK-NEXT: [[__YLOGX:%.*]] = call fast float @llvm.experimental.constrained.fmul.f32(float [[POWNI2F]], float [[__LOG2]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR0]]
+; CHECK-NEXT: [[__EXP2:%.*]] = call fast float @llvm.exp2.f32(float [[__YLOGX]]) #[[ATTR0]]
; CHECK-NEXT: [[__YEVEN:%.*]] = shl i32 [[Y]], 31
; CHECK-NEXT: [[TMP0:%.*]] = bitcast float [[X]] to i32
; CHECK-NEXT: [[__POW_SIGN:%.*]] = and i32 [[__YEVEN]], [[TMP0]]
diff --git a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
index 2ffa647..2e64a34 100644
--- a/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
+++ b/llvm/test/CodeGen/AMDGPU/amdgpu-simplify-libcall-rootn.ll
@@ -896,7 +896,7 @@ define float @test_rootn_f32__y_neg2__strictfp(float %x) #1 {
; CHECK-LABEL: define float @test_rootn_f32__y_neg2__strictfp(
; CHECK-SAME: float [[X:%.*]]) #[[ATTR0]] {
; CHECK-NEXT: entry:
-; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]])
+; CHECK-NEXT: [[__ROOTN2RSQRT:%.*]] = call float @_Z5rsqrtf(float [[X]]) #[[ATTR0]]
; CHECK-NEXT: ret float [[__ROOTN2RSQRT]]
;
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
index af0eb23..3d4ae84d9 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa-call.ll
@@ -1025,33 +1025,33 @@ attributes #6 = { "enqueued-block" }
; AKF_HSA: attributes #[[ATTR8]] = { "amdgpu-calls" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="fiji" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR14]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "target-cpu"="gfx900" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR15]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "target-cpu"="fiji" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR16]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR17]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR18]] = { nounwind "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR19]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR20]] = { nounwind sanitize_address "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR21]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR22]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR23:[0-9]+]] = { nounwind sanitize_address "amdgpu-no-implicitarg-ptr" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR24:[0-9]+]] = { "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR25]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "enqueued-block" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR26]] = { "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR27]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_HSA: attributes #[[ATTR28]] = { nounwind }
; ATTRIBUTOR_HSA: attributes #[[ATTR29]] = { "enqueued-block" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
index 9a9c28a..43cdf85 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features-hsa.ll
@@ -643,19 +643,19 @@ attributes #1 = { nounwind }
; AKF_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-stack-objects" }
;.
; ATTRIBUTOR_HSA: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR10]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR11]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR12]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_HSA: attributes #[[ATTR13]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
; AKF_HSA: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 500}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
index 6c5e58c..547ff69 100644
--- a/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
+++ b/llvm/test/CodeGen/AMDGPU/annotate-kernel-features.ll
@@ -393,17 +393,18 @@ define amdgpu_kernel void @use_get_local_size_z(ptr addrspace(1) %ptr) #1 {
attributes #0 = { nounwind readnone }
attributes #1 = { nounwind }
+;.
; AKF_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
; AKF_CHECK: attributes #[[ATTR1]] = { nounwind }
;.
; ATTRIBUTOR_CHECK: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
-; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR1]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR3]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR4]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR5]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR6]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR7]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR8]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_CHECK: attributes #[[ATTR9]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workitem-id-x" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
index 1ebd864..2970495 100644
--- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
+++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_pixelshader.ll
@@ -477,7 +477,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1032-NEXT: s_cbranch_execz .LBB1_3
; GFX1032-NEXT: ; %bb.2:
; GFX1032-NEXT: v_mov_b32_e32 v0, s11
-; GFX1032-NEXT: s_mov_b32 s10, s11
; GFX1032-NEXT: buffer_atomic_add v0, off, s[4:7], 0 glc
; GFX1032-NEXT: .LBB1_3:
; GFX1032-NEXT: s_waitcnt_depctr 0xffe3
@@ -615,7 +614,6 @@ define amdgpu_ps void @add_i32_varying(ptr addrspace(8) inreg %out, ptr addrspac
; GFX1132-NEXT: s_cbranch_execz .LBB1_3
; GFX1132-NEXT: ; %bb.2:
; GFX1132-NEXT: v_mov_b32_e32 v0, s11
-; GFX1132-NEXT: s_mov_b32 s10, s11
; GFX1132-NEXT: buffer_atomic_add_u32 v0, off, s[4:7], 0 glc
; GFX1132-NEXT: .LBB1_3:
; GFX1132-NEXT: s_or_b32 exec_lo, exec_lo, s9
diff --git a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
index c1da29e..3228962 100644
--- a/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
+++ b/llvm/test/CodeGen/AMDGPU/av_spill_cross_bb_usage.mir
@@ -14,6 +14,8 @@
---
name: test_av_spill_cross_bb_usage
tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
stack:
- { id: 0, name: '', type: spill-slot, offset: 0, size: 4, alignment: 4 }
machineFunctionInfo:
diff --git a/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
new file mode 100644
index 0000000..7108f3d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/bf16-conversions.ll
@@ -0,0 +1,357 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -march=amdgcn -mcpu=gfx940 < %s | FileCheck --check-prefixes=GCN %s
+
+; TODO: Add global-isel when it can support bf16
+
+define amdgpu_ps float @v_test_cvt_bf16_f32_v(bfloat %v) {
+; GCN-LABEL: v_test_cvt_bf16_f32_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; GCN-NEXT: ; return to shader part epilog
+ %cvt = fpext bfloat %v to float
+ ret float %cvt
+}
+
+define amdgpu_ps float @v_test_cvt_bf16_f32_s(bfloat inreg %v) {
+; GCN-LABEL: v_test_cvt_bf16_f32_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_lshl_b32 s0, s0, 16
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+ %cvt = fpext bfloat %v to float
+ ret float %cvt
+}
+
+define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_v(<2 x float> %src) {
+; GCN-LABEL: v_test_cvt_v2f32_v2bf16_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v2, v2, v0, s0
+; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GCN-NEXT: v_add3_u32 v2, v2, v1, s0
+; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GCN-NEXT: s_mov_b32 s0, 0x7060302
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT: v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT: ; return to shader part epilog
+ %res = fptrunc <2 x float> %src to <2 x bfloat>
+ %cast = bitcast <2 x bfloat> %res to float
+ ret float %cast
+}
+
+define amdgpu_ps float @v_test_cvt_v2f32_v2bf16_s(<2 x float> inreg %src) {
+; GCN-LABEL: v_test_cvt_v2f32_v2bf16_s:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_bfe_u32 s2, s1, 0x10010
+; GCN-NEXT: s_add_i32 s2, s2, s1
+; GCN-NEXT: s_or_b32 s4, s1, 0x400000
+; GCN-NEXT: s_add_i32 s5, s2, 0x7fff
+; GCN-NEXT: v_cmp_u_f32_e64 s[2:3], s1, s1
+; GCN-NEXT: s_and_b64 s[2:3], s[2:3], exec
+; GCN-NEXT: s_cselect_b32 s2, s4, s5
+; GCN-NEXT: s_bfe_u32 s1, s0, 0x10010
+; GCN-NEXT: s_add_i32 s1, s1, s0
+; GCN-NEXT: s_or_b32 s3, s0, 0x400000
+; GCN-NEXT: s_add_i32 s4, s1, 0x7fff
+; GCN-NEXT: v_cmp_u_f32_e64 s[0:1], s0, s0
+; GCN-NEXT: s_and_b64 s[0:1], s[0:1], exec
+; GCN-NEXT: s_cselect_b32 s0, s3, s4
+; GCN-NEXT: s_pack_hh_b32_b16 s0, s0, s2
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: ; return to shader part epilog
+ %res = fptrunc <2 x float> %src to <2 x bfloat>
+ %cast = bitcast <2 x bfloat> %res to float
+ ret float %cast
+}
+
+define amdgpu_ps float @v_test_cvt_f32_bf16_v(float %src) {
+; GCN-LABEL: v_test_cvt_f32_bf16_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v1, v1, v0, s0
+; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v0
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v2, vcc
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: ; return to shader part epilog
+ %trunc = fptrunc float %src to bfloat
+ %ext = fpext bfloat %trunc to float
+ ret float %ext
+}
+
+define amdgpu_ps float @v_test_cvt_v2f64_v2bf16_v(<2 x double> %src) {
+; GCN-LABEL: v_test_cvt_v2f64_v2bf16_v:
+; GCN: ; %bb.0:
+; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT: v_and_b32_e32 v7, 1, v6
+; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT: v_add_u32_e32 v4, v6, v4
+; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT: s_brev_b32 s4, 1
+; GCN-NEXT: v_and_or_b32 v5, v1, s4, v4
+; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT: s_movk_i32 s5, 0x7fff
+; GCN-NEXT: v_add3_u32 v4, v4, v5, s5
+; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v5, vcc
+; GCN-NEXT: v_cvt_f32_f64_e64 v5, |v[2:3]|
+; GCN-NEXT: v_cvt_f64_f32_e32 v[0:1], v5
+; GCN-NEXT: v_and_b32_e32 v6, 1, v5
+; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[2:3]|, v[0:1]
+; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[2:3]|, v[0:1]
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6
+; GCN-NEXT: v_cndmask_b32_e64 v0, -1, 1, s[2:3]
+; GCN-NEXT: v_add_u32_e32 v0, v5, v0
+; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v5, vcc
+; GCN-NEXT: v_and_or_b32 v1, v3, s4, v0
+; GCN-NEXT: v_bfe_u32 v0, v0, 16, 1
+; GCN-NEXT: v_add3_u32 v0, v0, v1, s5
+; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[2:3], v[2:3]
+; GCN-NEXT: s_mov_b32 s0, 0x7060302
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
+; GCN-NEXT: v_perm_b32 v0, v0, v4, s0
+; GCN-NEXT: ; return to shader part epilog
+ %res = fptrunc <2 x double> %src to <2 x bfloat>
+ %cast = bitcast <2 x bfloat> %res to float
+ ret float %cast
+}
+
+define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16(float %a, float %b) {
+; GCN-LABEL: fptrunc_f32_f32_to_v2bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_bfe_u32 v2, v0, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v2, v2, v0, s0
+; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v0
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc
+; GCN-NEXT: v_bfe_u32 v2, v1, 16, 1
+; GCN-NEXT: v_add3_u32 v2, v2, v1, s0
+; GCN-NEXT: v_or_b32_e32 v3, 0x400000, v1
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v1, v1
+; GCN-NEXT: s_mov_b32 s0, 0x7060302
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc
+; GCN-NEXT: v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %a.cvt = fptrunc float %a to bfloat
+ %b.cvt = fptrunc float %b to bfloat
+ %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
+ %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
+ %ret = bitcast <2 x bfloat> %v2.2 to float
+ ret float %ret
+}
+
+define amdgpu_ps float @fptrunc_f32_f32_to_v2bf16_mods(float %a, float %b) {
+; GCN-LABEL: fptrunc_f32_f32_to_v2bf16_mods:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_xor_b32_e32 v2, 0x80000000, v0
+; GCN-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v3, v3, v2, s0
+; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc
+; GCN-NEXT: v_and_b32_e32 v2, 0x7fffffff, v1
+; GCN-NEXT: v_bfe_u32 v3, v2, 16, 1
+; GCN-NEXT: v_add3_u32 v3, v3, v2, s0
+; GCN-NEXT: v_or_b32_e32 v2, 0x400000, v2
+; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v1|, |v1|
+; GCN-NEXT: s_mov_b32 s0, 0x7060302
+; GCN-NEXT: s_nop 0
+; GCN-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GCN-NEXT: v_perm_b32 v0, v1, v0, s0
+; GCN-NEXT: ; return to shader part epilog
+entry:
+ %a.neg = fneg float %a
+ %a.cvt = fptrunc float %a.neg to bfloat
+ %b.abs = call float @llvm.fabs.f32(float %b)
+ %b.cvt = fptrunc float %b.abs to bfloat
+ %v2.1 = insertelement <2 x bfloat> undef, bfloat %a.cvt, i32 0
+ %v2.2 = insertelement <2 x bfloat> %v2.1, bfloat %b.cvt, i32 1
+ %ret = bitcast <2 x bfloat> %v2.2 to float
+ ret float %ret
+}
+
+define amdgpu_ps void @fptrunc_f32_to_bf16(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v3, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_bfe_u32 v1, v0, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v1, v1, v0, s0
+; GCN-NEXT: v_or_b32_e32 v4, 0x400000, v0
+; GCN-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v1, v4, vcc
+; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT: s_endpgm
+entry:
+ %a.cvt = fptrunc float %a to bfloat
+ store bfloat %a.cvt, ptr %out
+ ret void
+}
+
+define amdgpu_ps void @fptrunc_f32_to_bf16_abs(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16_abs:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v3, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_and_b32_e32 v1, 0x7fffffff, v0
+; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v4, v4, v1, s0
+; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT: v_cmp_u_f32_e64 vcc, |v0|, |v0|
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT: s_endpgm
+entry:
+ %a.abs = call float @llvm.fabs.f32(float %a)
+ %a.cvt = fptrunc float %a.abs to bfloat
+ store bfloat %a.cvt, ptr %out
+ ret void
+}
+
+define amdgpu_ps void @fptrunc_f32_to_bf16_neg(float %a, ptr %out) {
+; GCN-LABEL: fptrunc_f32_to_bf16_neg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_mov_b32_e32 v3, v2
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_xor_b32_e32 v1, 0x80000000, v0
+; GCN-NEXT: v_bfe_u32 v4, v1, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v4, v4, v1, s0
+; GCN-NEXT: v_or_b32_e32 v1, 0x400000, v1
+; GCN-NEXT: v_cmp_u_f32_e64 vcc, -v0, -v0
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v1, vcc
+; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT: s_endpgm
+entry:
+ %a.neg = fneg float %a
+ %a.cvt = fptrunc float %a.neg to bfloat
+ store bfloat %a.cvt, ptr %out
+ ret void
+}
+
+define amdgpu_ps void @fptrunc_f64_to_bf16(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_cvt_f32_f64_e64 v6, |v[0:1]|
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v6
+; GCN-NEXT: v_and_b32_e32 v7, 1, v6
+; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v7
+; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT: v_add_u32_e32 v4, v6, v4
+; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc
+; GCN-NEXT: s_brev_b32 s0, 1
+; GCN-NEXT: v_and_or_b32 v5, v1, s0, v4
+; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT: s_endpgm
+entry:
+ %a.cvt = fptrunc double %a to bfloat
+ store bfloat %a.cvt, ptr %out
+ ret void
+}
+
+define amdgpu_ps void @fptrunc_f64_to_bf16_neg(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16_neg:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
+; GCN-NEXT: v_and_b32_e32 v8, 1, v7
+; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT: v_add_u32_e32 v4, v7, v4
+; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT: s_brev_b32 s4, 1
+; GCN-NEXT: v_xor_b32_e32 v6, 0x80000000, v1
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: v_and_or_b32 v5, v6, s4, v4
+; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT: v_cmp_u_f64_e64 vcc, -v[0:1], -v[0:1]
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT: s_endpgm
+entry:
+ %a.neg = fneg double %a
+ %a.cvt = fptrunc double %a.neg to bfloat
+ store bfloat %a.cvt, ptr %out
+ ret void
+}
+
+define amdgpu_ps void @fptrunc_f64_to_bf16_abs(double %a, ptr %out) {
+; GCN-LABEL: fptrunc_f64_to_bf16_abs:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: v_cvt_f32_f64_e64 v7, |v[0:1]|
+; GCN-NEXT: v_cvt_f64_f32_e32 v[4:5], v7
+; GCN-NEXT: v_and_b32_e32 v8, 1, v7
+; GCN-NEXT: v_cmp_gt_f64_e64 s[2:3], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_nlg_f64_e64 s[0:1], |v[0:1]|, v[4:5]
+; GCN-NEXT: v_cmp_eq_u32_e32 vcc, 1, v8
+; GCN-NEXT: v_cndmask_b32_e64 v4, -1, 1, s[2:3]
+; GCN-NEXT: v_add_u32_e32 v4, v7, v4
+; GCN-NEXT: s_or_b64 vcc, s[0:1], vcc
+; GCN-NEXT: v_and_b32_e32 v6, 0x7fffffff, v1
+; GCN-NEXT: v_cndmask_b32_e32 v4, v4, v7, vcc
+; GCN-NEXT: s_brev_b32 s0, 1
+; GCN-NEXT: v_and_or_b32 v5, v6, s0, v4
+; GCN-NEXT: v_bfe_u32 v4, v4, 16, 1
+; GCN-NEXT: s_movk_i32 s0, 0x7fff
+; GCN-NEXT: v_add3_u32 v4, v4, v5, s0
+; GCN-NEXT: v_or_b32_e32 v5, 0x400000, v5
+; GCN-NEXT: v_cmp_u_f64_e64 vcc, |v[0:1]|, |v[0:1]|
+; GCN-NEXT: s_nop 1
+; GCN-NEXT: v_cndmask_b32_e32 v0, v4, v5, vcc
+; GCN-NEXT: flat_store_short_d16_hi v[2:3], v0 sc0 sc1
+; GCN-NEXT: s_endpgm
+entry:
+ %a.abs = call double @llvm.fabs.f64(double %a)
+ %a.cvt = fptrunc double %a.abs to bfloat
+ store bfloat %a.cvt, ptr %out
+ ret void
+}
+
+declare float @llvm.fabs.f32(float)
+declare double @llvm.fabs.f64(double)
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index ebb77c1..9865883 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -16968,7 +16968,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -16977,7 +16977,7 @@ define bfloat @v_fabs_bf16(bfloat %a) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -17163,9 +17163,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GCN-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GCN-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
@@ -17174,9 +17174,9 @@ define bfloat @v_fneg_fabs_bf16(bfloat %a) {
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e64 v0, 1.0, |v0|
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fffffff, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, -1.0, v0
+; GFX7-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -17280,8 +17280,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -17293,8 +17291,6 @@ define bfloat @v_minnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -17375,10 +17371,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v1, v1, v3
; GCN-NEXT: v_min_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -17396,10 +17388,6 @@ define <2 x bfloat> @v_minnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v1, v1, v3
; GFX7-NEXT: v_min_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -17522,12 +17510,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v2, v2, v5
; GCN-NEXT: v_min_f32_e32 v1, v1, v4
; GCN-NEXT: v_min_f32_e32 v0, v0, v3
@@ -17551,12 +17533,6 @@ define <3 x bfloat> @v_minnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v2, v2, v5
; GFX7-NEXT: v_min_f32_e32 v1, v1, v4
; GFX7-NEXT: v_min_f32_e32 v0, v0, v3
@@ -17688,14 +17664,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v3, v3, v7
; GCN-NEXT: v_min_f32_e32 v2, v2, v6
; GCN-NEXT: v_min_f32_e32 v1, v1, v5
@@ -17725,14 +17693,6 @@ define <4 x bfloat> @v_minnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v3, v3, v7
; GFX7-NEXT: v_min_f32_e32 v2, v2, v6
; GFX7-NEXT: v_min_f32_e32 v1, v1, v5
@@ -17951,22 +17911,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v7, v7, v15
; GCN-NEXT: v_min_f32_e32 v6, v6, v14
; GCN-NEXT: v_min_f32_e32 v5, v5, v13
@@ -18020,22 +17964,6 @@ define <8 x bfloat> @v_minnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v7, v7, v15
; GFX7-NEXT: v_min_f32_e32 v6, v6, v14
; GFX7-NEXT: v_min_f32_e32 v5, v5, v13
@@ -18382,71 +18310,51 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_min_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_min_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_min_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_min_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_min_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_min_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_min_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_min_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_min_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_min_f32_e32 v5, v5, v21
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
@@ -18461,8 +18369,6 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_min_f32_e32 v4, v4, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -18474,21 +18380,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_min_f32_e32 v3, v3, v19
; GCN-NEXT: v_min_f32_e32 v2, v2, v18
; GCN-NEXT: v_min_f32_e32 v1, v1, v17
; GCN-NEXT: v_min_f32_e32 v0, v0, v16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -18503,8 +18398,9 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_min_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -18513,14 +18409,12 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_minnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
@@ -18531,13 +18425,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -18560,13 +18454,13 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
@@ -18579,48 +18473,14 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_min_f32_e32 v14, v14, v30
; GFX7-NEXT: v_min_f32_e32 v13, v13, v29
; GFX7-NEXT: v_min_f32_e32 v12, v12, v28
; GFX7-NEXT: v_min_f32_e32 v11, v11, v27
; GFX7-NEXT: v_min_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_min_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_min_f32_e32 v9, v9, v25
; GFX7-NEXT: v_min_f32_e32 v8, v8, v24
; GFX7-NEXT: v_min_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_min_f32_e32 v6, v6, v22
; GFX7-NEXT: v_min_f32_e32 v5, v5, v21
; GFX7-NEXT: v_min_f32_e32 v4, v4, v20
; GFX7-NEXT: v_min_f32_e32 v3, v3, v19
@@ -18634,6 +18494,10 @@ define <16 x bfloat> @v_minnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_min_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
@@ -19267,287 +19131,223 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_min_f32_e32 v31, v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_min_f32_e32 v30, v30, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_min_f32_e32 v29, v29, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_min_f32_e32 v28, v28, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_min_f32_e32 v27, v27, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_min_f32_e32 v26, v26, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_min_f32_e32 v25, v25, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_min_f32_e32 v24, v24, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_min_f32_e32 v23, v23, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_min_f32_e32 v22, v22, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_min_f32_e32 v21, v21, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_min_f32_e32 v20, v20, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_min_f32_e32 v19, v19, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_min_f32_e32 v18, v18, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_min_f32_e32 v17, v17, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_min_f32_e32 v16, v16, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_min_f32_e32 v15, v15, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_min_f32_e32 v14, v14, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_min_f32_e32 v13, v13, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_min_f32_e32 v12, v12, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_min_f32_e32 v11, v11, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_min_f32_e32 v10, v10, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_min_f32_e32 v9, v9, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_min_f32_e32 v8, v8, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_min_f32_e32 v7, v7, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_min_f32_e32 v6, v6, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_min_f32_e32 v5, v5, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_min_f32_e32 v4, v4, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_min_f32_e32 v3, v3, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_min_f32_e32 v2, v2, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_min_f32_e32 v1, v1, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_min_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -19590,322 +19390,258 @@ define <32 x bfloat> @v_minnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_min_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_min_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_min_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -21097,8 +20833,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v0, v0, v1
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -21110,8 +20844,6 @@ define bfloat @v_maxnum_bf16(bfloat %a, bfloat %b) {
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v0, v0, v1
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -21192,10 +20924,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v1, v1, v3
; GCN-NEXT: v_max_f32_e32 v0, v0, v2
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -21213,10 +20941,6 @@ define <2 x bfloat> @v_maxnum_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v1, v1, v3
; GFX7-NEXT: v_max_f32_e32 v0, v0, v2
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
@@ -21339,12 +21063,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v2, v2, v5
; GCN-NEXT: v_max_f32_e32 v1, v1, v4
; GCN-NEXT: v_max_f32_e32 v0, v0, v3
@@ -21368,12 +21086,6 @@ define <3 x bfloat> @v_maxnum_v3bf16(<3 x bfloat> %a, <3 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v2, v2, v5
; GFX7-NEXT: v_max_f32_e32 v1, v1, v4
; GFX7-NEXT: v_max_f32_e32 v0, v0, v3
@@ -21505,14 +21217,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v7
; GCN-NEXT: v_max_f32_e32 v2, v2, v6
; GCN-NEXT: v_max_f32_e32 v1, v1, v5
@@ -21542,14 +21246,6 @@ define <4 x bfloat> @v_maxnum_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v3, v3, v7
; GFX7-NEXT: v_max_f32_e32 v2, v2, v6
; GFX7-NEXT: v_max_f32_e32 v1, v1, v5
@@ -21768,22 +21464,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v7, v7, v15
; GCN-NEXT: v_max_f32_e32 v6, v6, v14
; GCN-NEXT: v_max_f32_e32 v5, v5, v13
@@ -21837,22 +21517,6 @@ define <8 x bfloat> @v_maxnum_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v7, v7, v15
; GFX7-NEXT: v_max_f32_e32 v6, v6, v14
; GFX7-NEXT: v_max_f32_e32 v5, v5, v13
@@ -22199,71 +21863,51 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_max_f32_e32 v14, v14, v30
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_max_f32_e32 v13, v13, v29
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_max_f32_e32 v12, v12, v28
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_max_f32_e32 v11, v11, v27
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_max_f32_e32 v10, v10, v26
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_max_f32_e32 v9, v9, v25
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_max_f32_e32 v8, v8, v24
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_max_f32_e32 v7, v7, v23
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_max_f32_e32 v6, v6, v22
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_max_f32_e32 v5, v5, v21
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
@@ -22278,8 +21922,6 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_max_f32_e32 v4, v4, v20
; GCN-NEXT: buffer_load_dword v20, off, s[0:3], s32
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -22291,21 +21933,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_max_f32_e32 v3, v3, v19
; GCN-NEXT: v_max_f32_e32 v2, v2, v18
; GCN-NEXT: v_max_f32_e32 v1, v1, v17
; GCN-NEXT: v_max_f32_e32 v0, v0, v16
-; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
@@ -22320,8 +21951,9 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
+; GCN-NEXT: s_waitcnt vmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_max_f32_e32 v15, v15, v16
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
@@ -22330,14 +21962,12 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-LABEL: v_maxnum_v16bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
-; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
-; GFX7-NEXT: buffer_load_dword v25, off, s[0:3], s32
+; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
+; GFX7-NEXT: buffer_load_dword v22, off, s[0:3], s32
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
@@ -22348,13 +21978,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
+; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
+; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
@@ -22377,13 +22007,13 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
+; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
+; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
@@ -22396,48 +22026,14 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_max_f32_e32 v14, v14, v30
; GFX7-NEXT: v_max_f32_e32 v13, v13, v29
; GFX7-NEXT: v_max_f32_e32 v12, v12, v28
; GFX7-NEXT: v_max_f32_e32 v11, v11, v27
; GFX7-NEXT: v_max_f32_e32 v10, v10, v26
-; GFX7-NEXT: v_max_f32_e32 v15, v15, v25
+; GFX7-NEXT: v_max_f32_e32 v9, v9, v25
; GFX7-NEXT: v_max_f32_e32 v8, v8, v24
; GFX7-NEXT: v_max_f32_e32 v7, v7, v23
-; GFX7-NEXT: v_max_f32_e32 v6, v6, v22
; GFX7-NEXT: v_max_f32_e32 v5, v5, v21
; GFX7-NEXT: v_max_f32_e32 v4, v4, v20
; GFX7-NEXT: v_max_f32_e32 v3, v3, v19
@@ -22451,6 +22047,10 @@ define <16 x bfloat> @v_maxnum_v16bf16(<16 x bfloat> %a, <16 x bfloat> %b) {
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
+; GFX7-NEXT: s_waitcnt vmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
+; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
+; GFX7-NEXT: v_max_f32_e32 v15, v15, v22
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
@@ -23084,287 +22684,223 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GCN-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GCN-NEXT: v_mul_f32_e32 v31, 1.0, v31
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:124
; GCN-NEXT: v_max_f32_e32 v31, v31, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GCN-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:120
; GCN-NEXT: v_max_f32_e32 v30, v30, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GCN-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:116
; GCN-NEXT: v_max_f32_e32 v29, v29, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GCN-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:112
; GCN-NEXT: v_max_f32_e32 v28, v28, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GCN-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:108
; GCN-NEXT: v_max_f32_e32 v27, v27, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GCN-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:104
; GCN-NEXT: v_max_f32_e32 v26, v26, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GCN-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:100
; GCN-NEXT: v_max_f32_e32 v25, v25, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GCN-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:96
; GCN-NEXT: v_max_f32_e32 v24, v24, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GCN-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:92
; GCN-NEXT: v_max_f32_e32 v23, v23, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GCN-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:88
; GCN-NEXT: v_max_f32_e32 v22, v22, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GCN-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:84
; GCN-NEXT: v_max_f32_e32 v21, v21, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GCN-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:80
; GCN-NEXT: v_max_f32_e32 v20, v20, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GCN-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:76
; GCN-NEXT: v_max_f32_e32 v19, v19, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GCN-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:72
; GCN-NEXT: v_max_f32_e32 v18, v18, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GCN-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:68
; GCN-NEXT: v_max_f32_e32 v17, v17, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GCN-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:64
; GCN-NEXT: v_max_f32_e32 v16, v16, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GCN-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:60
; GCN-NEXT: v_max_f32_e32 v15, v15, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GCN-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:56
; GCN-NEXT: v_max_f32_e32 v14, v14, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GCN-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:52
; GCN-NEXT: v_max_f32_e32 v13, v13, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GCN-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:48
; GCN-NEXT: v_max_f32_e32 v12, v12, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GCN-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:44
; GCN-NEXT: v_max_f32_e32 v11, v11, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GCN-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:40
; GCN-NEXT: v_max_f32_e32 v10, v10, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GCN-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:36
; GCN-NEXT: v_max_f32_e32 v9, v9, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GCN-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:32
; GCN-NEXT: v_max_f32_e32 v8, v8, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GCN-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:28
; GCN-NEXT: v_max_f32_e32 v7, v7, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GCN-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:24
; GCN-NEXT: v_max_f32_e32 v6, v6, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GCN-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:20
; GCN-NEXT: v_max_f32_e32 v5, v5, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GCN-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:16
; GCN-NEXT: v_max_f32_e32 v4, v4, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GCN-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:12
; GCN-NEXT: v_max_f32_e32 v3, v3, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GCN-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:8
; GCN-NEXT: v_max_f32_e32 v2, v2, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GCN-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
; GCN-NEXT: v_max_f32_e32 v1, v1, v32
-; GCN-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: s_waitcnt vmcnt(0)
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v33
; GCN-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GCN-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GCN-NEXT: v_max_f32_e32 v0, v0, v32
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
@@ -23407,322 +22943,258 @@ define <32 x bfloat> @v_maxnum_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:128
; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
-; GFX7-NEXT: v_mul_f32_e32 v30, 1.0, v30
; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
-; GFX7-NEXT: v_mul_f32_e32 v29, 1.0, v29
; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
-; GFX7-NEXT: v_mul_f32_e32 v28, 1.0, v28
; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
-; GFX7-NEXT: v_mul_f32_e32 v27, 1.0, v27
; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
-; GFX7-NEXT: v_mul_f32_e32 v26, 1.0, v26
; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
-; GFX7-NEXT: v_mul_f32_e32 v25, 1.0, v25
; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
-; GFX7-NEXT: v_mul_f32_e32 v24, 1.0, v24
; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
-; GFX7-NEXT: v_mul_f32_e32 v23, 1.0, v23
; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
-; GFX7-NEXT: v_mul_f32_e32 v22, 1.0, v22
; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
-; GFX7-NEXT: v_mul_f32_e32 v21, 1.0, v21
; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
-; GFX7-NEXT: v_mul_f32_e32 v20, 1.0, v20
; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
-; GFX7-NEXT: v_mul_f32_e32 v19, 1.0, v19
; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
-; GFX7-NEXT: v_mul_f32_e32 v18, 1.0, v18
; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
-; GFX7-NEXT: v_mul_f32_e32 v17, 1.0, v17
; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
-; GFX7-NEXT: v_mul_f32_e32 v16, 1.0, v16
; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
-; GFX7-NEXT: v_mul_f32_e32 v15, 1.0, v15
; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
-; GFX7-NEXT: v_mul_f32_e32 v14, 1.0, v14
; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
-; GFX7-NEXT: v_mul_f32_e32 v13, 1.0, v13
; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
-; GFX7-NEXT: v_mul_f32_e32 v12, 1.0, v12
; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
-; GFX7-NEXT: v_mul_f32_e32 v11, 1.0, v11
; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
-; GFX7-NEXT: v_mul_f32_e32 v10, 1.0, v10
; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
-; GFX7-NEXT: v_mul_f32_e32 v9, 1.0, v9
; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
-; GFX7-NEXT: v_mul_f32_e32 v8, 1.0, v8
; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
-; GFX7-NEXT: v_mul_f32_e32 v7, 1.0, v7
; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
-; GFX7-NEXT: v_mul_f32_e32 v6, 1.0, v6
; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
-; GFX7-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; GFX7-NEXT: s_waitcnt vmcnt(1)
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: s_waitcnt vmcnt(0)
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
-; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
-; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
-; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
-; GFX7-NEXT: v_mul_f32_e32 v4, 1.0, v4
; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
-; GFX7-NEXT: v_mul_f32_e32 v3, 1.0, v3
; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
-; GFX7-NEXT: v_mul_f32_e32 v2, 1.0, v2
; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
-; GFX7-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
-; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(1)
+; GFX7-NEXT: v_mul_f32_e32 v31, 1.0, v31
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: v_max_f32_e32 v31, v31, v32
+; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:124
+; GFX7-NEXT: v_and_b32_e32 v31, 0xffff0000, v31
+; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
+; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
; GFX7-NEXT: v_max_f32_e32 v30, v30, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:120
; GFX7-NEXT: v_and_b32_e32 v30, 0xffff0000, v30
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v29, v29, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:116
; GFX7-NEXT: v_and_b32_e32 v29, 0xffff0000, v29
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v28, v28, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:112
; GFX7-NEXT: v_and_b32_e32 v28, 0xffff0000, v28
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v27, v27, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
; GFX7-NEXT: v_and_b32_e32 v27, 0xffff0000, v27
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v26, v26, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:104
; GFX7-NEXT: v_and_b32_e32 v26, 0xffff0000, v26
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v25, v25, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:100
; GFX7-NEXT: v_and_b32_e32 v25, 0xffff0000, v25
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v24, v24, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:96
; GFX7-NEXT: v_and_b32_e32 v24, 0xffff0000, v24
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v23, v23, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:92
; GFX7-NEXT: v_and_b32_e32 v23, 0xffff0000, v23
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v22, v22, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:88
; GFX7-NEXT: v_and_b32_e32 v22, 0xffff0000, v22
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v21, v21, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:84
; GFX7-NEXT: v_and_b32_e32 v21, 0xffff0000, v21
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v20, v20, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:80
; GFX7-NEXT: v_and_b32_e32 v20, 0xffff0000, v20
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v19, v19, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:76
; GFX7-NEXT: v_and_b32_e32 v19, 0xffff0000, v19
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v18, v18, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:72
; GFX7-NEXT: v_and_b32_e32 v18, 0xffff0000, v18
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v17, v17, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:68
; GFX7-NEXT: v_and_b32_e32 v17, 0xffff0000, v17
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v16, v16, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:64
; GFX7-NEXT: v_and_b32_e32 v16, 0xffff0000, v16
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v15, v15, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:60
; GFX7-NEXT: v_and_b32_e32 v15, 0xffff0000, v15
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v14, v14, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:56
; GFX7-NEXT: v_and_b32_e32 v14, 0xffff0000, v14
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v13, v13, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:52
; GFX7-NEXT: v_and_b32_e32 v13, 0xffff0000, v13
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v12, v12, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:48
; GFX7-NEXT: v_and_b32_e32 v12, 0xffff0000, v12
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v11, v11, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:44
; GFX7-NEXT: v_and_b32_e32 v11, 0xffff0000, v11
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v10, v10, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:40
; GFX7-NEXT: v_and_b32_e32 v10, 0xffff0000, v10
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v9, v9, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:36
; GFX7-NEXT: v_and_b32_e32 v9, 0xffff0000, v9
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v8, v8, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:32
; GFX7-NEXT: v_and_b32_e32 v8, 0xffff0000, v8
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v7, v7, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:28
; GFX7-NEXT: v_and_b32_e32 v7, 0xffff0000, v7
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v6, v6, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:24
; GFX7-NEXT: v_and_b32_e32 v6, 0xffff0000, v6
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v5, v5, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:20
; GFX7-NEXT: v_and_b32_e32 v5, 0xffff0000, v5
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v4, v4, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:16
; GFX7-NEXT: v_and_b32_e32 v4, 0xffff0000, v4
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v3, v3, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:12
; GFX7-NEXT: v_and_b32_e32 v3, 0xffff0000, v3
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v2, v2, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:8
; GFX7-NEXT: v_and_b32_e32 v2, 0xffff0000, v2
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v1, v1, v32
; GFX7-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:4
; GFX7-NEXT: v_and_b32_e32 v1, 0xffff0000, v1
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_and_b32_e32 v32, 0xffff0000, v32
-; GFX7-NEXT: v_mul_f32_e32 v32, 1.0, v32
; GFX7-NEXT: v_max_f32_e32 v0, v0, v32
; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -25176,7 +24648,6 @@ define { bfloat, i16 } @v_frexp_bf16_i16(bfloat %a) {
; GCN-NEXT: v_frexp_exp_i32_f32_e32 v2, v0
; GCN-NEXT: v_cmp_lt_f32_e64 vcc, |v0|, s4
; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc
-; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GCN-NEXT: v_cndmask_b32_e32 v1, 0, v2, vcc
; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
@@ -26818,11 +26289,17 @@ define bfloat @v_canonicalize_bf16(bfloat %a) {
; GCN-LABEL: v_canonicalize_bf16:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GCN-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GCN-NEXT: s_setpc_b64 s[30:31]
;
; GFX7-LABEL: v_canonicalize_bf16:
; GFX7: ; %bb.0:
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0xffff0000, v0
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: v_canonicalize_bf16:
diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll
index dfadd8d..9472845 100644
--- a/llvm/test/CodeGen/AMDGPU/clamp.ll
+++ b/llvm/test/CodeGen/AMDGPU/clamp.ll
@@ -2996,18 +2996,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_elt(ptr addrspace(1) %out, ptr ad
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -3095,16 +3093,15 @@ define amdgpu_kernel void @v_clamp_v2f16_not_zero(ptr addrspace(1) %out, ptr add
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e64 v2, v2 clamp
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 2.0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
+; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
+; GFX6-NEXT: v_max_f32_e32 v2, 2.0, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
@@ -3198,9 +3195,8 @@ define amdgpu_kernel void @v_clamp_v2f16_not_one(ptr addrspace(1) %out, ptr addr
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
-; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, 0
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
@@ -3760,19 +3756,17 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts0(ptr addrspace(1) %out
; GFX6-NEXT: s_waitcnt lgkmcnt(0)
; GFX6-NEXT: s_mov_b64 s[4:5], s[2:3]
; GFX6-NEXT: buffer_load_dword v2, v[0:1], s[4:7], 0 addr64
-; GFX6-NEXT: s_mov_b32 s2, 0x7fc00000
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
+; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v3, v3, s2, 1.0
+; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
+; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
@@ -3863,18 +3857,16 @@ define amdgpu_kernel void @v_clamp_v2f16_undef_limit_elts1(ptr addrspace(1) %out
; GFX6-NEXT: v_mov_b32_e32 v4, 0x7fc00000
; GFX6-NEXT: s_mov_b64 s[2:3], s[6:7]
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v2
-; GFX6-NEXT: v_lshrrev_b32_e32 v2, 16, v2
+; GFX6-NEXT: v_lshrrev_b32_e32 v3, 16, v2
; GFX6-NEXT: v_cvt_f32_f16_e32 v2, v2
-; GFX6-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; GFX6-NEXT: v_max_f32_e32 v3, 0x7fc00000, v3
-; GFX6-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; GFX6-NEXT: v_med3_f32 v2, v2, 0, v4
-; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
-; GFX6-NEXT: v_min_f32_e32 v3, 1.0, v3
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v3
+; GFX6-NEXT: v_max_f32_e32 v2, 0x7fc00000, v2
+; GFX6-NEXT: v_med3_f32 v3, v3, 0, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; GFX6-NEXT: v_or_b32_e32 v2, v3, v2
+; GFX6-NEXT: v_min_f32_e32 v2, 1.0, v2
+; GFX6-NEXT: v_cvt_f16_f32_e32 v2, v2
+; GFX6-NEXT: v_lshlrev_b32_e32 v3, 16, v3
+; GFX6-NEXT: v_or_b32_e32 v2, v2, v3
; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64
; GFX6-NEXT: s_endpgm
;
diff --git a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
index 2ed6d7f..1c8725f 100644
--- a/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
+++ b/llvm/test/CodeGen/AMDGPU/convergence-tokens.ll
@@ -1,10 +1,12 @@
; RUN: llc --amdgpu-disable-structurizer -stop-after=amdgpu-isel -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,ISEL %s
; RUN: llc --amdgpu-disable-structurizer -stop-after=dead-mi-elimination -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck --check-prefixes=CHECK,DEADMI %s
+; RUN: llc --amdgpu-disable-structurizer -global-isel -stop-after=irtranslator -mtriple=amdgcn-- -mcpu=gfx900 -verify-machineinstrs -o - %s | FileCheck %s --check-prefixes=CHECK,GISEL
; CHECK-LABEL: name: basic_call
-; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ENTRY
-; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, [[TOKEN]], csr_amdgpu, {{.*}}
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+; ISEL: {{.*}} SI_CALL_ISEL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
; DEADMI: {{.*}} SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
+; GISEL: {{.*}} G_SI_CALL {{.*}}, @foo, csr_amdgpu, {{.*}}, implicit [[TOKEN]]
define i32 @basic_call(i32 %src) #0 {
%t = call token @llvm.experimental.convergence.entry()
%r = call i32 @foo(i32 %src) [ "convergencectrl"(token %t) ]
@@ -12,10 +14,11 @@ define i32 @basic_call(i32 %src) #0 {
}
; CHECK-LABEL: name: basic_intrinsic
-; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; DEADMI-NOT: CONVERGENCECTRL_GLUE
-; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]]
define i32 @basic_intrinsic(i32 %src) #0 {
%t = call token @llvm.experimental.convergence.anchor()
%r = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t) ]
@@ -30,12 +33,13 @@ define i32 @uncontrolled_call(i32 %src) #0 {
}
; CHECK-LABEL: name: basic_branch
-; CHECK: bb.0.entry:
-; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
-; CHECK: bb.1.then:
+; CHECK: bb.[[#]].entry:
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+; CHECK: bb.[[#]].then:
; ISEL: CONVERGENCECTRL_GLUE [[TOKEN]]
; DEADMI-NOT: CONVERGENCECTRL_GLUE
-; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[TOKEN]]
+; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[TOKEN]]
define i32 @basic_branch(i32 %src, i1 %cond) #0 {
entry:
%t = call token @llvm.experimental.convergence.anchor()
@@ -52,12 +56,13 @@ else:
}
; CHECK-LABEL: name: basic_loop
-; CHECK: [[TOKEN:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_ANCHOR
-; CHECK: bb.1.loop:
-; CHECK: [[LOOP:%[0-9]+]]:sreg_64 = CONVERGENCECTRL_LOOP [[TOKEN]]
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+; CHECK: bb.[[#]].loop:
+; CHECK: [[LOOP:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_LOOP [[TOKEN]]
; ISEL: CONVERGENCECTRL_GLUE [[LOOP]]
; DEADMI-NOT: CONVERGENCECTRL_GLUE
-; CHECK: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]]
+; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[LOOP]]
+; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[LOOP]]
define i32 @basic_loop(i32 %src, i1 %cond) #0 {
%t1 = call token @llvm.experimental.convergence.anchor()
br label %loop
@@ -71,6 +76,32 @@ end:
ret i32 %r
}
+; CHECK-LABEL: name: nested
+; CHECK: [[ENTRY:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+; CHECK: [[ANCHOR:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ANCHOR
+; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ANCHOR]]
+; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ANCHOR]]
+; ISEL: {{.*}} = V_READFIRSTLANE_B32 {{.*}}, implicit [[ENTRY]]
+; GISEL: {{.*}} = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.readfirstlane){{.*}}, implicit [[ENTRY]]
+define i32 @nested(i32 %src) #0 {
+ %t1 = call token @llvm.experimental.convergence.entry()
+ %t2 = call token @llvm.experimental.convergence.anchor()
+ %r2 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t2) ]
+ %r1 = call i32 @llvm.amdgcn.readfirstlane(i32 %src) [ "convergencectrl"(token %t1) ]
+ %sum = add i32 %r1, %r2
+ ret i32 %sum
+}
+
+; CHECK-LABEL: name: tail_call_void_func_void
+; CHECK: [[TOKEN:%[0-9]+]]{{[^ ]*}} = CONVERGENCECTRL_ENTRY
+; CHECK: {{.*}} SI_TCRETURN {{.*}}, @external_void_func_void, 0, csr_amdgpu, {{.*}}implicit [[TOKEN]]
+define void @tail_call_void_func_void() #0 {
+ %t1 = call token @llvm.experimental.convergence.entry()
+ tail call void @external_void_func_void() [ "convergencectrl"(token %t1) ]
+ ret void
+}
+
+declare hidden void @external_void_func_void() #0
declare i32 @foo(i32 %x) #0
declare i32 @llvm.amdgcn.readfirstlane(i32) #0
diff --git a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
index 895185c..577d38e 100644
--- a/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
+++ b/llvm/test/CodeGen/AMDGPU/copy-vgpr-clobber-spill-vgpr.mir
@@ -333,7 +333,7 @@
ret void
}
- attributes #0 = { "amdgpu-waves-per-eu"="4,4" }
+ attributes #0 = { "amdgpu-waves-per-eu"="4,4" "amdgpu-no-agpr" }
...
---
diff --git a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
index 0c03419..386f9cd 100644
--- a/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/direct-indirect-call.ll
@@ -35,6 +35,6 @@ define amdgpu_kernel void @test_direct_indirect_call() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR1]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/div_i128.ll b/llvm/test/CodeGen/AMDGPU/div_i128.ll
index 2f3d5d9..cf99b5d 100644
--- a/llvm/test/CodeGen/AMDGPU/div_i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_i128.ll
@@ -1,10 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
-; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9-SDAG %s
-; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-SDAG-O0 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9 %s
+; RUN: llc -O0 -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0 %s
-; FIXME: GlobalISel missing the power-of-2 cases in legalization. https://github.com/llvm/llvm-project/issues/80671
-; xUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9,GFX9 %s
-; xUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-O0,GFX9-O0 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G %s
+; RUN: llc -O0 -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -o - %s | FileCheck -check-prefixes=GFX9-G-O0 %s
define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-LABEL: v_sdiv_i128_vv:
@@ -1223,6 +1222,1158 @@ define i128 @v_sdiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_sdiv_i128_vv:
+; GFX9-G: ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v16, 31, v3
+; GFX9-G-NEXT: v_xor_b32_e32 v0, v16, v0
+; GFX9-G-NEXT: v_xor_b32_e32 v1, v16, v1
+; GFX9-G-NEXT: v_sub_co_u32_e32 v10, vcc, v0, v16
+; GFX9-G-NEXT: v_xor_b32_e32 v2, v16, v2
+; GFX9-G-NEXT: v_subb_co_u32_e32 v11, vcc, v1, v16, vcc
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v17, 31, v7
+; GFX9-G-NEXT: v_xor_b32_e32 v3, v16, v3
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v2, v16, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v16, vcc
+; GFX9-G-NEXT: v_xor_b32_e32 v0, v17, v4
+; GFX9-G-NEXT: v_xor_b32_e32 v1, v17, v5
+; GFX9-G-NEXT: v_sub_co_u32_e32 v18, vcc, v0, v17
+; GFX9-G-NEXT: v_xor_b32_e32 v2, v17, v6
+; GFX9-G-NEXT: v_subb_co_u32_e32 v19, vcc, v1, v17, vcc
+; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v7
+; GFX9-G-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v17, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v5, vcc, v3, v17, vcc
+; GFX9-G-NEXT: v_or_b32_e32 v0, v18, v4
+; GFX9-G-NEXT: v_or_b32_e32 v1, v19, v5
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-G-NEXT: v_or_b32_e32 v0, v10, v12
+; GFX9-G-NEXT: v_or_b32_e32 v1, v11, v13
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v18
+; GFX9-G-NEXT: v_ffbh_u32_e32 v0, v19
+; GFX9-G-NEXT: v_add_u32_e32 v1, 32, v1
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v4
+; GFX9-G-NEXT: v_min_u32_e32 v0, v0, v1
+; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v5
+; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[4:5]
+; GFX9-G-NEXT: v_add_u32_e32 v0, 64, v0
+; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v10
+; GFX9-G-NEXT: v_cndmask_b32_e64 v0, v1, v0, s[6:7]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v1, v11
+; GFX9-G-NEXT: v_add_u32_e32 v2, 32, v2
+; GFX9-G-NEXT: v_ffbh_u32_e32 v3, v12
+; GFX9-G-NEXT: v_min_u32_e32 v1, v1, v2
+; GFX9-G-NEXT: v_ffbh_u32_e32 v2, v13
+; GFX9-G-NEXT: v_add_u32_e32 v3, 32, v3
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[12:13]
+; GFX9-G-NEXT: v_add_u32_e32 v1, 64, v1
+; GFX9-G-NEXT: v_min_u32_e32 v2, v2, v3
+; GFX9-G-NEXT: v_cndmask_b32_e64 v1, v2, v1, s[6:7]
+; GFX9-G-NEXT: v_sub_co_u32_e64 v0, s[6:7], v0, v1
+; GFX9-G-NEXT: v_subb_co_u32_e64 v1, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v6, 0x7f
+; GFX9-G-NEXT: v_subb_co_u32_e64 v2, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-G-NEXT: v_subb_co_u32_e64 v3, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[0:1], v[6:7]
+; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[6:7]
+; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT: v_or_b32_e32 v15, v1, v3
+; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[6:7]
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v7, v6, s[6:7]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v7, 0, 1, s[4:5]
+; GFX9-G-NEXT: v_or_b32_e32 v20, v7, v6
+; GFX9-G-NEXT: v_xor_b32_e32 v6, 0x7f, v0
+; GFX9-G-NEXT: v_or_b32_e32 v14, v6, v2
+; GFX9-G-NEXT: v_and_b32_e32 v6, 1, v20
+; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v6
+; GFX9-G-NEXT: v_cndmask_b32_e64 v6, v10, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v7, v11, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v12, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v13, 0, vcc
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v14, 0, 1, vcc
+; GFX9-G-NEXT: v_or_b32_e32 v14, v20, v14
+; GFX9-G-NEXT: v_and_b32_e32 v14, 1, v14
+; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-G-NEXT: s_cbranch_execz .LBB0_6
+; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1
+; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, 1, v0
+; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v1, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, 0, v2, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, 0, v3, vcc
+; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT: v_sub_co_u32_e32 v8, vcc, 0x7f, v0
+; GFX9-G-NEXT: v_sub_u32_e32 v0, 64, v8
+; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v0, v[10:11]
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v8, v[12:13]
+; GFX9-G-NEXT: v_subrev_u32_e32 v9, 64, v8
+; GFX9-G-NEXT: v_lshlrev_b64 v[6:7], v8, v[10:11]
+; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], v9, v[10:11]
+; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
+; GFX9-G-NEXT: v_cndmask_b32_e32 v6, 0, v6, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v7, 0, v7, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v0, v12, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v1, v13, vcc
+; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX9-G-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-G-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-G-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
+; GFX9-G-NEXT: s_cbranch_execz .LBB0_5
+; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader
+; GFX9-G-NEXT: v_sub_u32_e32 v2, 64, v20
+; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v20, v[10:11]
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], v2, v[12:13]
+; GFX9-G-NEXT: v_subrev_u32_e32 v24, 64, v20
+; GFX9-G-NEXT: v_lshrrev_b64 v[14:15], v20, v[12:13]
+; GFX9-G-NEXT: v_or_b32_e32 v2, v0, v2
+; GFX9-G-NEXT: v_or_b32_e32 v3, v1, v3
+; GFX9-G-NEXT: v_lshrrev_b64 v[0:1], v24, v[12:13]
+; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v20
+; GFX9-G-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v1, v1, v3, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v14, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v15, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v24, vcc, -1, v18
+; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v20
+; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v19, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v12, v0, v10, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v13, v1, v11, s[4:5]
+; GFX9-G-NEXT: v_addc_co_u32_e32 v26, vcc, -1, v4, vcc
+; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX9-G-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-G-NEXT: v_addc_co_u32_e32 v27, vcc, -1, v5, vcc
+; GFX9-G-NEXT: v_mov_b32_e32 v11, 0
+; GFX9-G-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-NEXT: v_mov_b32_e32 v2, s10
+; GFX9-G-NEXT: v_mov_b32_e32 v3, s11
+; GFX9-G-NEXT: .LBB0_3: ; %udiv-do-while
+; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v10, 31, v7
+; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2
+; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[12:13]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v12, 31, v9
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15]
+; GFX9-G-NEXT: v_or_b32_e32 v2, v2, v12
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v14, 31, v13
+; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v24, v2
+; GFX9-G-NEXT: v_or_b32_e32 v0, v0, v14
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v25, v3, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v26, v0, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v12, vcc, v27, v1, vcc
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v28, 31, v12
+; GFX9-G-NEXT: v_and_b32_e32 v12, v28, v18
+; GFX9-G-NEXT: v_sub_co_u32_e32 v12, vcc, v2, v12
+; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v19
+; GFX9-G-NEXT: v_subb_co_u32_e32 v13, vcc, v3, v2, vcc
+; GFX9-G-NEXT: v_and_b32_e32 v2, v28, v4
+; GFX9-G-NEXT: v_subb_co_u32_e32 v14, vcc, v0, v2, vcc
+; GFX9-G-NEXT: v_and_b32_e32 v0, v28, v5
+; GFX9-G-NEXT: v_subb_co_u32_e32 v15, vcc, v1, v0, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v20, vcc, -1, v20
+; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, -1, v21, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v22, vcc, -1, v22, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v23, vcc
+; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT: v_or_b32_e32 v0, v20, v22
+; GFX9-G-NEXT: v_or_b32_e32 v1, v21, v23
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v10
+; GFX9-G-NEXT: v_and_b32_e32 v10, 1, v28
+; GFX9-G-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-G-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GFX9-G-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-G-NEXT: s_cbranch_execnz .LBB0_3
+; GFX9-G-NEXT: ; %bb.4: ; %Flow
+; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX9-G-NEXT: .LBB0_5: ; %Flow2
+; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13]
+; GFX9-G-NEXT: v_lshlrev_b64 v[2:3], 1, v[6:7]
+; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v4, 31, v7
+; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v4
+; GFX9-G-NEXT: v_or_b32_e32 v6, v0, v2
+; GFX9-G-NEXT: v_or_b32_e32 v7, v1, v3
+; GFX9-G-NEXT: .LBB0_6: ; %Flow3
+; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-G-NEXT: v_xor_b32_e32 v3, v17, v16
+; GFX9-G-NEXT: v_xor_b32_e32 v0, v6, v3
+; GFX9-G-NEXT: v_xor_b32_e32 v1, v7, v3
+; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v3
+; GFX9-G-NEXT: v_xor_b32_e32 v2, v8, v3
+; GFX9-G-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v3, vcc
+; GFX9-G-NEXT: v_xor_b32_e32 v4, v9, v3
+; GFX9-G-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v3, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v4, v3, vcc
+; GFX9-G-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-O0-LABEL: v_sdiv_i128_vv:
+; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v7
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v1
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr4_vgpr5 killed $exec
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v12, v3, v8
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v10, v1, v2
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr13_vgpr14 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v16
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s6
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v11, v3, v8
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr1 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr1_vgpr2 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v9, v1, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v12, v1
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v4, v12, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v10, v3
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v10, v2
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v1, s[6:7], v1, v12
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[6:7], v4, v12, s[6:7]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v6, s[6:7], v3, v10, s[6:7]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v5, s[6:7], v2, v10, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v16
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v5, v11, v5
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v8, v11, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v14
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v9, v7
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v9, v6
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v5, s[6:7], v5, v11
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v15, s[6:7], v8, v11, s[6:7]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v14, s[6:7], v7, v9, s[6:7]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v13, s[6:7], v6, v9, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v13
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v13, v11, v12
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v11, v12
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v11, v9, v10
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v9, v9, v10
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14
+; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12
+; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14
+; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12
+; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12]
+; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7
+; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6
+; GFX9-G-O0-NEXT: s_mov_b32 s10, 64
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8
+; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b32 s16, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8
+; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10
+; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9
+; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b32 s15, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s11, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s14, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s10, 0
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s16
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s16
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9]
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s14
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9]
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9]
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4
+; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13
+; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7]
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10
+; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9
+; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8
+; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5
+; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5
+; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6
+; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1
+; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_3
+; GFX9-G-O0-NEXT: s_branch .LBB0_8
+; GFX9-G-O0-NEXT: .LBB0_1: ; %Flow
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4)
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB0_5
+; GFX9-G-O0-NEXT: .LBB0_3: ; %Flow2
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB0_9
+; GFX9-G-O0-NEXT: .LBB0_4: ; %udiv-loop-exit
+; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7
+; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB0_3
+; GFX9-G-O0-NEXT: .LBB0_5: ; %Flow1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB0_4
+; GFX9-G-O0-NEXT: .LBB0_6: ; %udiv-do-while
+; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6
+; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15
+; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22
+; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3
+; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15
+; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22
+; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15
+; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 1
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9
+; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24
+; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11
+; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22
+; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8
+; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20
+; GFX9-G-O0-NEXT: s_mov_b32 s8, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s12, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s11, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19
+; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20
+; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19
+; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20]
+; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0
+; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12
+; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB0_6
+; GFX9-G-O0-NEXT: s_branch .LBB0_1
+; GFX9-G-O0-NEXT: .LBB0_7: ; %udiv-preheader
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:320 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:324 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 64
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16]
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25
+; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23
+; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13
+; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20
+; GFX9-G-O0-NEXT: s_mov_b32 s4, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s7, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s6, -1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5]
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5]
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5]
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6
+; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB0_6
+; GFX9-G-O0-NEXT: .LBB0_8: ; %udiv-bb1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 1
+; GFX9-G-O0-NEXT: s_mov_b32 s10, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:324 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:320 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s7, 64
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14]
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17
+; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15
+; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8
+; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7
+; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4
+; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8]
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: s_cbranch_execz .LBB0_5
+; GFX9-G-O0-NEXT: s_branch .LBB0_7
+; GFX9-G-O0-NEXT: .LBB0_9: ; %udiv-end
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[20:21]
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v2
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v0, v0, v8
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v1, v1, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v10
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v2, v2, v6
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v3, v3, v5
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v0, s[4:5], v0, v8
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v1, s[4:5], v1, v7, s[4:5]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v2, s[4:5], v2, v6, s[4:5]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v3, s[4:5], v3, v5, s[4:5]
+; GFX9-G-O0-NEXT: ; kill: killed $vgpr4
+; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:328 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_nop 0
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:332 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:336 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:340 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:344 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31]
%div = sdiv i128 %lhs, %rhs
ret i128 %div
}
@@ -2306,6 +3457,1043 @@ define i128 @v_udiv_i128_vv(i128 %lhs, i128 %rhs) {
; GFX9-O0-NEXT: s_mov_b64 exec, s[4:5]
; GFX9-O0-NEXT: s_waitcnt vmcnt(0)
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_udiv_i128_vv:
+; GFX9-G: ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_or_b32_e32 v8, v4, v6
+; GFX9-G-NEXT: v_or_b32_e32 v9, v5, v7
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GFX9-G-NEXT: v_or_b32_e32 v8, v0, v2
+; GFX9-G-NEXT: v_or_b32_e32 v9, v1, v3
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v4
+; GFX9-G-NEXT: v_ffbh_u32_e32 v8, v5
+; GFX9-G-NEXT: v_add_u32_e32 v9, 32, v9
+; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v6
+; GFX9-G-NEXT: v_min_u32_e32 v8, v8, v9
+; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v7
+; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[6:7]
+; GFX9-G-NEXT: v_add_u32_e32 v8, 64, v8
+; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10
+; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v0
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7]
+; GFX9-G-NEXT: v_ffbh_u32_e32 v9, v1
+; GFX9-G-NEXT: v_add_u32_e32 v10, 32, v10
+; GFX9-G-NEXT: v_ffbh_u32_e32 v11, v2
+; GFX9-G-NEXT: v_min_u32_e32 v9, v9, v10
+; GFX9-G-NEXT: v_ffbh_u32_e32 v10, v3
+; GFX9-G-NEXT: v_add_u32_e32 v11, 32, v11
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[2:3]
+; GFX9-G-NEXT: v_add_u32_e32 v9, 64, v9
+; GFX9-G-NEXT: v_min_u32_e32 v10, v10, v11
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v10, v9, s[6:7]
+; GFX9-G-NEXT: v_sub_co_u32_e64 v12, s[6:7], v8, v9
+; GFX9-G-NEXT: v_subb_co_u32_e64 v13, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v8, 0x7f
+; GFX9-G-NEXT: v_subb_co_u32_e64 v14, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v9, 0
+; GFX9-G-NEXT: v_subb_co_u32_e64 v15, s[6:7], 0, 0, s[6:7]
+; GFX9-G-NEXT: v_cmp_gt_u64_e64 s[6:7], v[12:13], v[8:9]
+; GFX9-G-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[6:7]
+; GFX9-G-NEXT: v_cmp_lt_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-G-NEXT: v_or_b32_e32 v17, v13, v15
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[6:7]
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[6:7], 0, v[14:15]
+; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v9, v8, s[6:7]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, 0, 1, s[4:5]
+; GFX9-G-NEXT: v_or_b32_e32 v18, v9, v8
+; GFX9-G-NEXT: v_xor_b32_e32 v8, 0x7f, v12
+; GFX9-G-NEXT: v_or_b32_e32 v16, v8, v14
+; GFX9-G-NEXT: v_and_b32_e32 v8, 1, v18
+; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v8
+; GFX9-G-NEXT: v_cndmask_b32_e64 v10, v0, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v11, v1, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v8, v2, 0, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e64 v9, v3, 0, vcc
+; GFX9-G-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GFX9-G-NEXT: v_or_b32_e32 v16, v18, v16
+; GFX9-G-NEXT: v_and_b32_e32 v16, 1, v16
+; GFX9-G-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GFX9-G-NEXT: s_cbranch_execz .LBB1_6
+; GFX9-G-NEXT: ; %bb.1: ; %udiv-bb1
+; GFX9-G-NEXT: v_add_co_u32_e32 v18, vcc, 1, v12
+; GFX9-G-NEXT: v_addc_co_u32_e32 v19, vcc, 0, v13, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v20, vcc, 0, v14, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v21, vcc, 0, v15, vcc
+; GFX9-G-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GFX9-G-NEXT: v_sub_co_u32_e32 v16, vcc, 0x7f, v12
+; GFX9-G-NEXT: v_sub_u32_e32 v8, 64, v16
+; GFX9-G-NEXT: v_lshrrev_b64 v[8:9], v8, v[0:1]
+; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], v16, v[2:3]
+; GFX9-G-NEXT: v_subrev_u32_e32 v14, 64, v16
+; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v16, v[0:1]
+; GFX9-G-NEXT: v_or_b32_e32 v10, v8, v10
+; GFX9-G-NEXT: v_or_b32_e32 v11, v9, v11
+; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], v14, v[0:1]
+; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX9-G-NEXT: v_cndmask_b32_e32 v14, 0, v12, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v15, 0, v13, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; GFX9-G-NEXT: v_cmp_eq_u32_e32 vcc, 0, v16
+; GFX9-G-NEXT: v_mov_b32_e32 v13, s11
+; GFX9-G-NEXT: v_cndmask_b32_e32 v8, v8, v2, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v9, v9, v3, vcc
+; GFX9-G-NEXT: v_mov_b32_e32 v11, s9
+; GFX9-G-NEXT: v_mov_b32_e32 v10, s8
+; GFX9-G-NEXT: v_mov_b32_e32 v12, s10
+; GFX9-G-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; GFX9-G-NEXT: s_xor_b64 s[12:13], exec, s[8:9]
+; GFX9-G-NEXT: s_cbranch_execz .LBB1_5
+; GFX9-G-NEXT: ; %bb.2: ; %udiv-preheader
+; GFX9-G-NEXT: v_sub_u32_e32 v12, 64, v18
+; GFX9-G-NEXT: v_subrev_u32_e32 v22, 64, v18
+; GFX9-G-NEXT: v_lshrrev_b64 v[10:11], v18, v[0:1]
+; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], v12, v[2:3]
+; GFX9-G-NEXT: v_lshrrev_b64 v[16:17], v18, v[2:3]
+; GFX9-G-NEXT: v_lshrrev_b64 v[2:3], v22, v[2:3]
+; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v12
+; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v13
+; GFX9-G-NEXT: v_cmp_gt_u32_e32 vcc, 64, v18
+; GFX9-G-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-G-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; GFX9-G-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
+; GFX9-G-NEXT: v_add_co_u32_e32 v22, vcc, -1, v4
+; GFX9-G-NEXT: v_addc_co_u32_e32 v23, vcc, -1, v5, vcc
+; GFX9-G-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GFX9-G-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v18
+; GFX9-G-NEXT: v_addc_co_u32_e32 v24, vcc, -1, v6, vcc
+; GFX9-G-NEXT: v_mov_b32_e32 v13, s11
+; GFX9-G-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; GFX9-G-NEXT: v_cndmask_b32_e64 v3, v3, v1, s[4:5]
+; GFX9-G-NEXT: v_addc_co_u32_e32 v25, vcc, -1, v7, vcc
+; GFX9-G-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-G-NEXT: v_mov_b32_e32 v11, s9
+; GFX9-G-NEXT: v_mov_b32_e32 v10, s8
+; GFX9-G-NEXT: v_mov_b32_e32 v12, s10
+; GFX9-G-NEXT: .LBB1_3: ; %udiv-do-while
+; GFX9-G-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[14:15]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v0, 31, v15
+; GFX9-G-NEXT: v_or_b32_e32 v14, v10, v12
+; GFX9-G-NEXT: v_or_b32_e32 v15, v11, v13
+; GFX9-G-NEXT: v_lshlrev_b64 v[12:13], 1, v[16:17]
+; GFX9-G-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v3
+; GFX9-G-NEXT: v_or_b32_e32 v12, v12, v2
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v9
+; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT: v_or_b32_e32 v2, v10, v2
+; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v0
+; GFX9-G-NEXT: v_sub_co_u32_e32 v0, vcc, v22, v2
+; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v23, v11, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v24, v12, vcc
+; GFX9-G-NEXT: v_subb_co_u32_e32 v0, vcc, v25, v13, vcc
+; GFX9-G-NEXT: v_add_co_u32_e64 v18, s[4:5], -1, v18
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v0
+; GFX9-G-NEXT: v_addc_co_u32_e64 v19, s[4:5], -1, v19, s[4:5]
+; GFX9-G-NEXT: v_and_b32_e32 v10, v3, v4
+; GFX9-G-NEXT: v_addc_co_u32_e64 v20, s[4:5], -1, v20, s[4:5]
+; GFX9-G-NEXT: v_and_b32_e32 v16, v3, v5
+; GFX9-G-NEXT: v_sub_co_u32_e32 v2, vcc, v2, v10
+; GFX9-G-NEXT: v_addc_co_u32_e64 v21, s[4:5], -1, v21, s[4:5]
+; GFX9-G-NEXT: v_and_b32_e32 v0, 1, v3
+; GFX9-G-NEXT: v_and_b32_e32 v17, v3, v6
+; GFX9-G-NEXT: v_and_b32_e32 v26, v3, v7
+; GFX9-G-NEXT: v_subb_co_u32_e32 v3, vcc, v11, v16, vcc
+; GFX9-G-NEXT: v_or_b32_e32 v10, v18, v20
+; GFX9-G-NEXT: v_or_b32_e32 v11, v19, v21
+; GFX9-G-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; GFX9-G-NEXT: v_subb_co_u32_e32 v16, vcc, v12, v17, vcc
+; GFX9-G-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-G-NEXT: v_subb_co_u32_e32 v17, vcc, v13, v26, vcc
+; GFX9-G-NEXT: s_or_b64 s[8:9], s[4:5], s[8:9]
+; GFX9-G-NEXT: v_mov_b32_e32 v10, v0
+; GFX9-G-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GFX9-G-NEXT: s_cbranch_execnz .LBB1_3
+; GFX9-G-NEXT: ; %bb.4: ; %Flow
+; GFX9-G-NEXT: s_or_b64 exec, exec, s[8:9]
+; GFX9-G-NEXT: .LBB1_5: ; %Flow2
+; GFX9-G-NEXT: s_or_b64 exec, exec, s[12:13]
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 1, v[14:15]
+; GFX9-G-NEXT: v_lshlrev_b64 v[8:9], 1, v[8:9]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 31, v15
+; GFX9-G-NEXT: v_or_b32_e32 v8, v8, v2
+; GFX9-G-NEXT: v_or_b32_e32 v10, v10, v0
+; GFX9-G-NEXT: v_or_b32_e32 v11, v11, v1
+; GFX9-G-NEXT: .LBB1_6: ; %Flow3
+; GFX9-G-NEXT: s_or_b64 exec, exec, s[6:7]
+; GFX9-G-NEXT: v_mov_b32_e32 v0, v10
+; GFX9-G-NEXT: v_mov_b32_e32 v1, v11
+; GFX9-G-NEXT: v_mov_b32_e32 v2, v8
+; GFX9-G-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-G-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-O0-LABEL: v_udiv_i128_vv:
+; GFX9-G-O0: ; %bb.0: ; %_udiv-special-cases
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: ; implicit-def: $vgpr8 : SGPR spill to VGPR lane
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v0
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v3
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v4
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_nop 0
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v5
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v7
+; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6_vgpr7_vgpr8 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14
+; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12
+; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[6:7], v[9:10], v[11:12]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v14
+; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v9, v12
+; GFX9-G-O0-NEXT: v_or_b32_e64 v11, v10, v11
+; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[11:12]
+; GFX9-G-O0-NEXT: s_or_b64 s[6:7], s[6:7], s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[8:9], v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v6, v7
+; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v6
+; GFX9-G-O0-NEXT: s_mov_b32 s10, 64
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-G-O0-NEXT: v_add_u32_e64 v6, v5, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v5, v5
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8
+; GFX9-G-O0-NEXT: v_min_u32_e64 v5, v5, v7
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v6, s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b32 s14, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[9:10], v[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v12
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v7, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v7, v8
+; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s10
+; GFX9-G-O0-NEXT: v_add_u32_e64 v7, v6, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v10
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v6, v6
+; GFX9-G-O0-NEXT: v_ffbh_u32_e64 v8, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, 32
+; GFX9-G-O0-NEXT: v_add_u32_e64 v8, v8, v9
+; GFX9-G-O0-NEXT: v_min_u32_e64 v6, v6, v8
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b32 s13, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s11, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s12, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s10, 0
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v6, s[8:9], v5, v6
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s14
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v7, s[8:9], v5, v7, s[8:9]
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s12
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v5, v8, s[8:9]
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s10
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v8, s[8:9]
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[12:13], 0x7f
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4
+; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[10:11], v[12:13], v[14:15]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[8:9], v[12:13], v[14:15]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s13
+; GFX9-G-O0-NEXT: v_cmp_gt_u64_e64 s[12:13], v[10:11], v[12:13]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[12:13]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[10:11]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v10, v5, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v11, s[6:7]
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v10
+; GFX9-G-O0-NEXT: s_mov_b32 s7, 0x7f
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v6, v6, s7
+; GFX9-G-O0-NEXT: v_xor_b32_e64 v7, v7, s6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v6, v6, v9
+; GFX9-G-O0-NEXT: v_or_b32_e64 v8, v7, v8
+; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[6:7], v[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v3
+; GFX9-G-O0-NEXT: v_and_b32_e32 v1, 1, v5
+; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-G-O0-NEXT: v_and_b32_e32 v3, 1, v5
+; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v3
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v6
+; GFX9-G-O0-NEXT: v_and_b32_e32 v5, 1, v5
+; GFX9-G-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], 0, v5
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], -1
+; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], exec
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s4, 0
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s5, 1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_3
+; GFX9-G-O0-NEXT: s_branch .LBB1_8
+; GFX9-G-O0-NEXT: .LBB1_1: ; %Flow
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v0, 2
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v0, 3
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: ; %bb.2: ; %Flow
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(4)
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB1_5
+; GFX9-G-O0-NEXT: .LBB1_3: ; %Flow2
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v4, 0
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v4, 1
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB1_9
+; GFX9-G-O0-NEXT: .LBB1_4: ; %udiv-loop-exit
+; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v7
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[10:11], v0, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[4:5]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr2 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v6, v2, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v16
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v11
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v7
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v1, v5
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v9
+; GFX9-G-O0-NEXT: v_or3_b32 v4, v4, v6, v7
+; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v5
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v2
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB1_3
+; GFX9-G-O0-NEXT: .LBB1_5: ; %Flow1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s4, v8, 4
+; GFX9-G-O0-NEXT: v_readlane_b32 s5, v8, 5
+; GFX9-G-O0-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB1_4
+; GFX9-G-O0-NEXT: .LBB1_6: ; %udiv-do-while
+; GFX9-G-O0-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_readlane_b32 s6, v16, 6
+; GFX9-G-O0-NEXT: v_readlane_b32 s7, v16, 7
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:204 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:208 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:212 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:216 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:220 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:224 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:228 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:232 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:244 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:248 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:252 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:256 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:260 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(16)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[21:22], v2, v[0:1]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[4:5], v2, v[3:4]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v2, v3
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v0, v1
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v15
+; GFX9-G-O0-NEXT: ; kill: def $vgpr2 killed $vgpr0 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s9
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v3, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v22
+; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v2, v3
+; GFX9-G-O0-NEXT: v_or_b32_e64 v9, v0, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[23:24], v0, v[2:3]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[0:1], v0, v[12:13]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr2 killed $exec
+; GFX9-G-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s8
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v14, v2, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, v0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v1
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(8)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v29, v31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v30, v32
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v33
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v34
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v29
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v30
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v23
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v24
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v15
+; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v1, v13
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v22
+; GFX9-G-O0-NEXT: v_or3_b32 v12, v12, v14, v15
+; GFX9-G-O0-NEXT: v_or3_b32 v2, v2, v3, v13
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v2
+; GFX9-G-O0-NEXT: ; kill: def $vgpr0_vgpr1 killed $vgpr0_vgpr1 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v12
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v13
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v11, s[8:9], v11, v4
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v10, v9, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v8, v7, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v6, v5, s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v8, v6, v10
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s8
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v6, v6, v10
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 1
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_and_b32_e64 v12, v8, s9
+; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, s8
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12 killed $vgpr12 def $vgpr12_vgpr13 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s4
+; GFX9-G-O0-NEXT: ; kill: def $vgpr12_vgpr13 killed $vgpr12_vgpr13 def $vgpr12_vgpr13_vgpr14_vgpr15 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v25
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v24, v26
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v27
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v28
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v23
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v24
+; GFX9-G-O0-NEXT: v_and_b32_e64 v11, v8, v11
+; GFX9-G-O0-NEXT: v_and_b32_e64 v10, v8, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v22
+; GFX9-G-O0-NEXT: v_and_b32_e64 v8, v6, v8
+; GFX9-G-O0-NEXT: v_and_b32_e64 v6, v6, v21
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[8:9], v4, v11
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v10, s[8:9], v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v9, s[8:9], v7, v8, s[8:9]
+; GFX9-G-O0-NEXT: v_subb_co_u32_e64 v8, s[8:9], v5, v6, s[8:9]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v20
+; GFX9-G-O0-NEXT: s_mov_b32 s8, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s12, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s11, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s8
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v17, s[8:9], v11, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, s12
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v18, s[8:9], v10, v11, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, s11
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v20, s[8:9], v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, s10
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v19, s[8:9], v8, v9, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v20
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v11, v19
+; GFX9-G-O0-NEXT: v_or_b32_e64 v17, v17, v20
+; GFX9-G-O0-NEXT: v_or_b32_e64 v19, v18, v19
+; GFX9-G-O0-NEXT: ; kill: def $vgpr17 killed $vgpr17 def $vgpr17_vgpr18 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, s4
+; GFX9-G-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], v[17:18], v[19:20]
+; GFX9-G-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v0
+; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v20, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v19, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v18, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, v12
+; GFX9-G-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v18, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v19, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v20, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 2
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 3
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s6, 6
+; GFX9-G-O0-NEXT: v_writelane_b32 v16, s7, 7
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX9-G-O0-NEXT: s_cbranch_execnz .LBB1_6
+; GFX9-G-O0-NEXT: s_branch .LBB1_1
+; GFX9-G-O0-NEXT: .LBB1_7: ; %udiv-preheader
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:264 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:268 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:272 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:276 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:280 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:284 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:288 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:292 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:296 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 64
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v22, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v21, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v4, v13, v4
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s4
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v5, v5, v13
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s4
+; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[4:5], v13, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s6
+; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v13, v6
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v13, v[21:22]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[26:27], v13, v[15:16]
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[24:25], v5, v[21:22]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v26
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v27
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v23, v24
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v25
+; GFX9-G-O0-NEXT: v_or_b32_e64 v14, v14, v23
+; GFX9-G-O0-NEXT: v_or_b32_e64 v13, v5, v13
+; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], 0
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[21:22], v4, v[21:22]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v21
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v22
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[4:5]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v5, v5, v13, s[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v15
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v16
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v4, v4, v14, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v5, v13, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v13, v13, v14, s[4:5]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v6, v6, v7, s[4:5]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4_vgpr5 killed $vgpr4_vgpr5 def $vgpr4_vgpr5_vgpr6_vgpr7 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, v14
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v17
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v20
+; GFX9-G-O0-NEXT: s_mov_b32 s4, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s10, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s7, -1
+; GFX9-G-O0-NEXT: s_mov_b32 s6, -1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v17, s4
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v16, s[4:5], v16, v17
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:260 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, s10
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v15, s[4:5], v15, v16, s[4:5]
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:256 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v14, s[4:5], v14, v15, s[4:5]
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:252 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v13, s[4:5], v13, v14, s[4:5]
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:248 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], s[8:9]
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], s[8:9]
+; GFX9-G-O0-NEXT: v_writelane_b32 v12, s8, 6
+; GFX9-G-O0-NEXT: v_writelane_b32 v12, s9, 7
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, s7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, s6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v12, s4
+; GFX9-G-O0-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:232 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:236 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:244 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:216 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:220 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:224 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:228 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:204 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:208 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:212 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_branch .LBB1_6
+; GFX9-G-O0-NEXT: .LBB1_8: ; %udiv-bb1
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 s[4:5], 0
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 1
+; GFX9-G-O0-NEXT: s_mov_b32 s10, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s9, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, s6
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(3)
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v5, s[6:7], v2, v5
+; GFX9-G-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:296 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, s10
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(1)
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v6, s[6:7], v4, v6, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v8, s[6:7], v3, v4, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s8
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v7, s[6:7], v1, v3, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v16, v7
+; GFX9-G-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:280 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:284 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:288 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:292 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0x7f
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-G-O0-NEXT: v_sub_co_u32_e64 v4, s[6:7], v1, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s7, 64
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v14, v10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v13, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v3, v4, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-G-O0-NEXT: v_sub_u32_e64 v9, v1, v4
+; GFX9-G-O0-NEXT: s_mov_b32 s6, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-G-O0-NEXT: v_cmp_lt_u32_e64 s[8:9], v4, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s6
+; GFX9-G-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v4, v1
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[1:2], v4, v[13:14]
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[18:19], v9, v[13:14]
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[16:17], v4, v[11:12]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v18
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v19
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v15, v16
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v17
+; GFX9-G-O0-NEXT: v_or_b32_e64 v10, v10, v15
+; GFX9-G-O0-NEXT: v_or_b32_e64 v4, v4, v9
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[13:14], v3, v[13:14]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v2
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, 0
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v2, v3, s[8:9]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v9, v13
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v14
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[8:9]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[8:9]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v11
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v12
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v9, v9, v10, s[6:7]
+; GFX9-G-O0-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v10, v3
+; GFX9-G-O0-NEXT: ; kill: def $vgpr1_vgpr2 killed $vgpr1_vgpr2 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v10
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:264 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:268 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:272 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:276 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[8:9], s[4:5]
+; GFX9-G-O0-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-G-O0-NEXT: v_or_b32_e64 v5, v5, v8
+; GFX9-G-O0-NEXT: v_or_b32_e64 v7, v6, v7
+; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v8, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v7, s4
+; GFX9-G-O0-NEXT: v_cmp_ne_u64_e64 s[4:5], v[5:6], v[7:8]
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s8
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s9
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s10
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, s11
+; GFX9-G-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 s[6:7], exec
+; GFX9-G-O0-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; GFX9-G-O0-NEXT: s_xor_b64 s[6:7], s[4:5], s[6:7]
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s6, 4
+; GFX9-G-O0-NEXT: v_writelane_b32 v0, s7, 5
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: s_cbranch_execz .LBB1_5
+; GFX9-G-O0-NEXT: s_branch .LBB1_7
+; GFX9-G-O0-NEXT: .LBB1_9: ; %udiv-end
+; GFX9-G-O0-NEXT: s_or_saveexec_b64 s[18:19], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[18:19]
+; GFX9-G-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v7
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v8
+; GFX9-G-O0-NEXT: ; kill: killed $vgpr4
+; GFX9-G-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1
+; GFX9-G-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:300 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_nop 0
+; GFX9-G-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:304 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:308 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:312 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:316 ; 4-byte Folded Reload
+; GFX9-G-O0-NEXT: s_mov_b64 exec, s[4:5]
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0)
+; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31]
%div = udiv i128 %lhs, %rhs
ret i128 %div
}
@@ -2388,6 +4576,66 @@ define i128 @v_sdiv_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: v_lshrrev_b64 v[3:4], s4, v[3:4]
; GFX9-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_sdiv_i128_v_pow2k:
+; GFX9-G: ; %bb.0:
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v4, 31, v3
+; GFX9-G-NEXT: v_mov_b32_e32 v5, v4
+; GFX9-G-NEXT: v_lshrrev_b64 v[4:5], 31, v[4:5]
+; GFX9-G-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4
+; GFX9-G-NEXT: v_addc_co_u32_e32 v4, vcc, v1, v5, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v2, vcc
+; GFX9-G-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v3, vcc
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[1:2]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v3, 1, v4
+; GFX9-G-NEXT: v_or_b32_e32 v0, v3, v0
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v3, 31, v2
+; GFX9-G-NEXT: v_ashrrev_i32_e32 v2, 1, v2
+; GFX9-G-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-O0-LABEL: v_sdiv_i128_v_pow2k:
+; GFX9-G-O0: ; %bb.0:
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v0
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v0, v0, v3
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v0
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b64 v[6:7], v0, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v6
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, v7
+; GFX9-G-O0-NEXT: s_mov_b32 s8, 0
+; GFX9-G-O0-NEXT: s_mov_b32 s5, 0
+; GFX9-G-O0-NEXT: v_add_co_u32_e64 v4, s[6:7], v4, v5
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v1, s[6:7], v1, v0, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s8
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v5, s[6:7], v2, v0, s[6:7]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-G-O0-NEXT: v_addc_co_u32_e64 v4, s[6:7], v3, v0, s[6:7]
+; GFX9-G-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v6, v4
+; GFX9-G-O0-NEXT: s_mov_b32 s5, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s5
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s5, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[5:6]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v3
+; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v3, v2, v4
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_ashrrev_i32_e64 v2, v2, v4
+; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31]
%div = sdiv i128 %lhs, 8589934592
ret i128 %div
}
@@ -2434,10 +4682,42 @@ define i128 @v_udiv_i128_v_pow2k(i128 %lhs) {
; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec
; GFX9-O0-NEXT: v_mov_b32_e32 v3, 0
; GFX9-O0-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-LABEL: v_udiv_i128_v_pow2k:
+; GFX9-G: ; %bb.0:
+; GFX9-G-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-NEXT: v_mov_b32_e32 v4, v1
+; GFX9-G-NEXT: v_lshlrev_b64 v[0:1], 31, v[2:3]
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v4
+; GFX9-G-NEXT: v_or_b32_e32 v0, v2, v0
+; GFX9-G-NEXT: v_lshrrev_b32_e32 v2, 1, v3
+; GFX9-G-NEXT: v_mov_b32_e32 v3, 0
+; GFX9-G-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-G-O0-LABEL: v_udiv_i128_v_pow2k:
+; GFX9-G-O0: ; %bb.0:
+; GFX9-G-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-G-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v0, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v0, v0, v1
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v1, s4
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 31
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_lshlrev_b64 v[5:6], v2, v[4:5]
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v4, v5
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, v6
+; GFX9-G-O0-NEXT: v_or_b32_e64 v0, v0, v4
+; GFX9-G-O0-NEXT: v_or_b32_e64 v1, v1, v2
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 1
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-G-O0-NEXT: v_lshrrev_b32_e64 v2, v2, v3
+; GFX9-G-O0-NEXT: s_mov_b32 s4, 0
+; GFX9-G-O0-NEXT: v_mov_b32_e32 v3, s4
+; GFX9-G-O0-NEXT: s_setpc_b64 s[30:31]
%div = udiv i128 %lhs, 8589934592
ret i128 %div
}
-
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; GFX9-SDAG: {{.*}}
-; GFX9-SDAG-O0: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
index 46e2632..16a03ba 100644
--- a/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
+++ b/llvm/test/CodeGen/AMDGPU/div_v2i128.ll
@@ -1,25 +1,3248 @@
-; RUN: not --crash llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=SDAG-ERR %s
-; RUN: not --crash llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s 2>&1 | FileCheck -check-prefix=GISEL-ERR %s
-
-; SDAG-ERR: LLVM ERROR: unsupported libcall legalization
-; GISEL-ERR: LLVM ERROR: unable to legalize instruction: %{{[0-9]+}}:_(s128) = G_SDIV %{{[0-9]+}}:_, %{{[0-9]+}}:_ (in function: v_sdiv_v2i128_vv)
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -verify-machineinstrs -o - %s | FileCheck -check-prefix=GISEL %s
define <2 x i128> @v_sdiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_sdiv_v2i128_vv:
+; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_ashrrev_i32_e32 v24, 31, v3
+; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v11
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: v_mov_b32_e32 v26, v24
+; SDAG-NEXT: v_mov_b32_e32 v27, v25
+; SDAG-NEXT: v_xor_b32_e32 v17, v24, v3
+; SDAG-NEXT: v_xor_b32_e32 v18, v24, v2
+; SDAG-NEXT: v_xor_b32_e32 v1, v24, v1
+; SDAG-NEXT: v_xor_b32_e32 v0, v24, v0
+; SDAG-NEXT: v_xor_b32_e32 v19, v25, v11
+; SDAG-NEXT: v_xor_b32_e32 v20, v25, v10
+; SDAG-NEXT: v_xor_b32_e32 v9, v25, v9
+; SDAG-NEXT: v_xor_b32_e32 v8, v25, v8
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v24
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v24, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v0, v2
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v18, v24, vcc
+; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v3
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v17, v24, vcc
+; SDAG-NEXT: v_or_b32_e32 v0, v2, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v17, v10
+; SDAG-NEXT: v_min_u32_e32 v18, v1, v18
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v8, v25
+; SDAG-NEXT: v_or_b32_e32 v1, v3, v11
+; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], 32, v17
+; SDAG-NEXT: v_ffbh_u32_e32 v17, v11
+; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18
+; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v9, v25, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v28
+; SDAG-NEXT: v_min_u32_e32 v8, v8, v17
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v20, v25, vcc
+; SDAG-NEXT: v_add_i32_e64 v9, s[8:9], 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v29
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v8, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v19, v25, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v28, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v19, v0
+; SDAG-NEXT: v_min_u32_e32 v20, v9, v20
+; SDAG-NEXT: v_or_b32_e32 v9, v29, v1
+; SDAG-NEXT: v_add_i32_e32 v19, vcc, 32, v19
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v1
+; SDAG-NEXT: v_add_i32_e32 v20, vcc, 64, v20
+; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_min_u32_e32 v8, v19, v21
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v22, 0, s[6:7]
+; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[6:7]
+; SDAG-NEXT: v_sub_i32_e32 v8, vcc, v8, v18
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v9, v17, vcc
+; SDAG-NEXT: v_xor_b32_e32 v17, 0x7f, v8
+; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v16, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v16, vcc
+; SDAG-NEXT: v_or_b32_e32 v16, v17, v18
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v9, v19
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v20
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v11, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v10, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v3, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v2, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB0_6
+; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v8
+; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v8
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v9, vcc
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v18, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v34, vcc, 0x7f, v8
+; SDAG-NEXT: v_or_b32_e32 v19, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[10:11], v34
+; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v34
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v34
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v35
+; SDAG-NEXT: v_or_b32_e32 v9, v9, v19
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v21, v9, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v20, v8, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v10, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB0_5
+; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v30
+; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30
+; SDAG-NEXT: v_lshr_b64 v[37:38], v[10:11], v30
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT: s_mov_b64 s[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_lshl_b64 v[48:49], v[10:11], v35
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[10:11], v36
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v17, v49
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v48
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v11, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v10, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, v38, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, v37, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v17, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v16, v2, vcc
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: .LBB0_3: ; %udiv-do-while3
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v3
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v16
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v38
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v39
+; SDAG-NEXT: v_or_b32_e32 v9, v19, v9
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v34, v2
+; SDAG-NEXT: v_or_b32_e32 v8, v18, v8
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v35, v3, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v36, v10, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v11, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v16
+; SDAG-NEXT: v_and_b32_e32 v39, v38, v28
+; SDAG-NEXT: v_and_b32_e32 v48, v38, v29
+; SDAG-NEXT: v_and_b32_e32 v49, v38, v0
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v38
+; SDAG-NEXT: v_and_b32_e32 v38, v38, v1
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v39
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v10, v49, vcc
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v38, vcc
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT: v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
+; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
+; SDAG-NEXT: v_mov_b32_e32 v23, v17
+; SDAG-NEXT: v_mov_b32_e32 v22, v16
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_cbranch_execnz .LBB0_3
+; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: .LBB0_5: ; %Flow14
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
+; SDAG-NEXT: v_or_b32_e32 v20, v19, v1
+; SDAG-NEXT: v_or_b32_e32 v21, v17, v3
+; SDAG-NEXT: v_or_b32_e32 v17, v18, v0
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v2
+; SDAG-NEXT: .LBB0_6: ; %Flow16
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_ashrrev_i32_e32 v18, 31, v7
+; SDAG-NEXT: v_ashrrev_i32_e32 v19, 31, v15
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: v_mov_b32_e32 v22, v18
+; SDAG-NEXT: v_mov_b32_e32 v23, v19
+; SDAG-NEXT: v_xor_b32_e32 v0, v18, v7
+; SDAG-NEXT: v_xor_b32_e32 v1, v18, v6
+; SDAG-NEXT: v_xor_b32_e32 v3, v18, v5
+; SDAG-NEXT: v_xor_b32_e32 v2, v18, v4
+; SDAG-NEXT: v_xor_b32_e32 v6, v19, v15
+; SDAG-NEXT: v_xor_b32_e32 v7, v19, v14
+; SDAG-NEXT: v_xor_b32_e32 v8, v19, v13
+; SDAG-NEXT: v_xor_b32_e32 v10, v19, v12
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v18
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v18, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v2
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v1, v18, vcc
+; SDAG-NEXT: v_add_i32_e64 v1, s[4:5], 32, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v3
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v0, v18, vcc
+; SDAG-NEXT: v_or_b32_e32 v0, v2, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v12, v4
+; SDAG-NEXT: v_min_u32_e32 v11, v1, v11
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, v10, v19
+; SDAG-NEXT: v_or_b32_e32 v1, v3, v5
+; SDAG-NEXT: v_add_i32_e64 v10, s[4:5], 32, v12
+; SDAG-NEXT: v_ffbh_u32_e32 v12, v5
+; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 64, v11
+; SDAG-NEXT: v_addc_u32_e64 v13, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v8, v19, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[0:1]
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v28
+; SDAG-NEXT: v_min_u32_e32 v8, v10, v12
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v13, 0, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v7, v19, vcc
+; SDAG-NEXT: v_add_i32_e64 v7, s[8:9], 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v12, v29
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v11, v8, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v6, v19, vcc
+; SDAG-NEXT: v_or_b32_e32 v6, v28, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v0
+; SDAG-NEXT: v_min_u32_e32 v12, v7, v12
+; SDAG-NEXT: v_or_b32_e32 v7, v29, v1
+; SDAG-NEXT: v_add_i32_e32 v11, vcc, 32, v11
+; SDAG-NEXT: v_ffbh_u32_e32 v13, v1
+; SDAG-NEXT: v_add_i32_e32 v12, vcc, 64, v12
+; SDAG-NEXT: v_addc_u32_e64 v14, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_min_u32_e32 v6, v11, v13
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v14, 0, s[6:7]
+; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v12, v6, s[6:7]
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v8
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v10, vcc
+; SDAG-NEXT: v_xor_b32_e32 v10, 0x7f, v6
+; SDAG-NEXT: v_subbrev_u32_e32 v8, vcc, 0, v9, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v9, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v8
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v11, v7, v9
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e32 v12, v13, v12, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_and_b32_e32 v10, 1, v12
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v10
+; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v5, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v3, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v2, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB0_12
+; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v6
+; SDAG-NEXT: v_sub_i32_e64 v12, s[4:5], 63, v6
+; SDAG-NEXT: v_mov_b32_e32 v10, 0
+; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v7, vcc
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[2:3], v12
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v8, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v7, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v9, vcc, 0x7f, v6
+; SDAG-NEXT: v_or_b32_e32 v8, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[4:5], v9
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, 64, v9
+; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v9
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[7:8]
+; SDAG-NEXT: v_lshr_b64 v[6:7], v[2:3], v6
+; SDAG-NEXT: v_or_b32_e32 v7, v15, v7
+; SDAG-NEXT: v_or_b32_e32 v6, v14, v6
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v9
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v13, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v12, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v35, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v34, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v8, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v12, v4, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v13, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB0_11
+; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT: v_lshr_b64 v[10:11], v[2:3], v30
+; SDAG-NEXT: v_sub_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v30
+; SDAG-NEXT: v_lshr_b64 v[37:38], v[4:5], v30
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v28
+; SDAG-NEXT: s_mov_b64 s[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v14, 0
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: v_mov_b32_e32 v12, 0
+; SDAG-NEXT: v_mov_b32_e32 v13, 0
+; SDAG-NEXT: v_lshl_b64 v[48:49], v[4:5], v35
+; SDAG-NEXT: v_lshr_b64 v[4:5], v[4:5], v36
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v29, vcc
+; SDAG-NEXT: v_or_b32_e32 v11, v11, v49
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v48
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v0, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v5, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v4, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v5, 0, v38, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, v37, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v1, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v10, v2, vcc
+; SDAG-NEXT: v_mov_b32_e32 v11, 0
+; SDAG-NEXT: .LBB0_9: ; %udiv-do-while
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v10, 31, v3
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v9
+; SDAG-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v7
+; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v10
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v38
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v39
+; SDAG-NEXT: v_or_b32_e32 v9, v13, v9
+; SDAG-NEXT: v_or_b32_e32 v7, v15, v7
+; SDAG-NEXT: v_or_b32_e32 v8, v12, v8
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v34, v2
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v35, v3, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v36, v4, vcc
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v37, v5, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v15, 31, v10
+; SDAG-NEXT: v_and_b32_e32 v10, 1, v15
+; SDAG-NEXT: v_and_b32_e32 v38, v15, v1
+; SDAG-NEXT: v_and_b32_e32 v39, v15, v0
+; SDAG-NEXT: v_and_b32_e32 v48, v15, v29
+; SDAG-NEXT: v_and_b32_e32 v15, v15, v28
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v15
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v48, vcc
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v39, vcc
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v38, vcc
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT: v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v6, v14, v6
+; SDAG-NEXT: v_mov_b32_e32 v15, v11
+; SDAG-NEXT: v_mov_b32_e32 v14, v10
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_cbranch_execnz .LBB0_9
+; SDAG-NEXT: ; %bb.10: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: .LBB0_11: ; %Flow11
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[8:9], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v7
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[6:7], 1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT: v_or_b32_e32 v13, v13, v1
+; SDAG-NEXT: v_or_b32_e32 v14, v11, v3
+; SDAG-NEXT: v_or_b32_e32 v11, v12, v0
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v2
+; SDAG-NEXT: .LBB0_12: ; %Flow12
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_xor_b32_e32 v3, v27, v26
+; SDAG-NEXT: v_xor_b32_e32 v2, v25, v24
+; SDAG-NEXT: v_xor_b32_e32 v7, v23, v22
+; SDAG-NEXT: v_xor_b32_e32 v6, v19, v18
+; SDAG-NEXT: v_xor_b32_e32 v4, v20, v3
+; SDAG-NEXT: v_xor_b32_e32 v5, v17, v2
+; SDAG-NEXT: v_xor_b32_e32 v1, v21, v3
+; SDAG-NEXT: v_xor_b32_e32 v0, v16, v2
+; SDAG-NEXT: v_xor_b32_e32 v8, v13, v7
+; SDAG-NEXT: v_xor_b32_e32 v9, v11, v6
+; SDAG-NEXT: v_xor_b32_e32 v11, v14, v7
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v5, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v4, v3, vcc
+; SDAG-NEXT: v_xor_b32_e32 v4, v10, v6
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v6
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v11, v7, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v9, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v8, v7, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_sdiv_v2i128_vv:
+; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v24, 31, v3
+; GISEL-NEXT: v_ashrrev_i32_e32 v25, 31, v11
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v21, 0
+; GISEL-NEXT: v_xor_b32_e32 v0, v24, v0
+; GISEL-NEXT: v_xor_b32_e32 v1, v24, v1
+; GISEL-NEXT: v_xor_b32_e32 v2, v24, v2
+; GISEL-NEXT: v_xor_b32_e32 v3, v24, v3
+; GISEL-NEXT: v_xor_b32_e32 v8, v25, v8
+; GISEL-NEXT: v_xor_b32_e32 v9, v25, v9
+; GISEL-NEXT: v_xor_b32_e32 v10, v25, v10
+; GISEL-NEXT: v_xor_b32_e32 v11, v25, v11
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v24
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v24, vcc
+; GISEL-NEXT: v_sub_i32_e64 v26, s[4:5], v8, v25
+; GISEL-NEXT: v_subb_u32_e64 v27, s[4:5], v9, v25, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v24, vcc
+; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v3, v24, vcc
+; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v25, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v25, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v8, v27
+; GISEL-NEXT: v_ffbh_u32_e32 v9, v26
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v17
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v16
+; GISEL-NEXT: v_or_b32_e32 v0, v26, v10
+; GISEL-NEXT: v_or_b32_e32 v1, v27, v11
+; GISEL-NEXT: v_or_b32_e32 v2, v16, v18
+; GISEL-NEXT: v_or_b32_e32 v3, v17, v19
+; GISEL-NEXT: v_add_i32_e32 v9, vcc, 32, v9
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v10
+; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23
+; GISEL-NEXT: v_ffbh_u32_e32 v30, v19
+; GISEL-NEXT: v_ffbh_u32_e32 v31, v18
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT: v_min_u32_e32 v0, v8, v9
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v29
+; GISEL-NEXT: v_min_u32_e32 v2, v22, v23
+; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v31
+; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT: v_min_u32_e32 v1, v28, v1
+; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2
+; GISEL-NEXT: v_min_u32_e32 v3, v30, v3
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[20:21]
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v2
+; GISEL-NEXT: v_or_b32_e32 v9, v1, v3
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v9, v22, v20
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v9
+; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e64 v20, v16, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v21, v17, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v18, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v19, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB0_6
+; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT: v_add_i32_e32 v28, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e64 v29, s[4:5], 0, v1, vcc
+; GISEL-NEXT: v_sub_i32_e32 v32, vcc, 0x7f, v0
+; GISEL-NEXT: v_addc_u32_e64 v30, vcc, 0, v2, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v31, vcc, 0, v3, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v32
+; GISEL-NEXT: v_sub_i32_e64 v8, s[4:5], 64, v32
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v32
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[18:19], v32
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[8:9], v[16:17], v8
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v32
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v8, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v9, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v0, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v19, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB0_5
+; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT: v_subrev_i32_e32 v34, vcc, 64, v28
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v28
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[18:19], v28
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v28
+; GISEL-NEXT: v_add_i32_e32 v32, vcc, -1, v26
+; GISEL-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v28
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v28
+; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v27, vcc
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[18:19], v22
+; GISEL-NEXT: v_lshr_b64 v[36:37], v[18:19], v34
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, v1, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v10, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v2, v22
+; GISEL-NEXT: v_or_b32_e32 v1, v3, v23
+; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v0, v36, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v1, v37, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v22, v0, v16, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v23, v1, v17, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v17, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: .LBB0_3: ; %udiv-do-while3
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshrrev_b32_e32 v16, 31, v21
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT: v_lshl_b64 v[36:37], v[22:23], 1
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23
+; GISEL-NEXT: v_lshrrev_b32_e32 v23, 31, v9
+; GISEL-NEXT: v_add_i32_e32 v28, vcc, -1, v28
+; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT: v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v18, v22
+; GISEL-NEXT: v_or_b32_e32 v3, v36, v23
+; GISEL-NEXT: v_addc_u32_e32 v30, vcc, -1, v30, vcc
+; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v16
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v32, v3
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v37, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v28, v30
+; GISEL-NEXT: v_or_b32_e32 v1, v29, v31
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v34, v2, vcc
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v19, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v16
+; GISEL-NEXT: s_or_b64 s[8:9], vcc, s[8:9]
+; GISEL-NEXT: v_and_b32_e32 v1, v0, v26
+; GISEL-NEXT: v_and_b32_e32 v18, v0, v27
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v0
+; GISEL-NEXT: v_and_b32_e32 v36, v0, v10
+; GISEL-NEXT: v_and_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, v3, v1
+; GISEL-NEXT: v_subb_u32_e32 v23, vcc, v37, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v18, vcc, v2, v36, vcc
+; GISEL-NEXT: v_subb_u32_e32 v19, vcc, v19, v0, vcc
+; GISEL-NEXT: v_mov_b32_e32 v0, v16
+; GISEL-NEXT: v_mov_b32_e32 v1, v17
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[8:9]
+; GISEL-NEXT: s_cbranch_execnz .LBB0_3
+; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: .LBB0_5: ; %Flow14
+; GISEL-NEXT: s_or_b64 exec, exec, s[14:15]
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[8:9], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v10, 31, v21
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT: .LBB0_6: ; %Flow16
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v18, 31, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v19, 31, v15
+; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_xor_b32_e32 v0, v18, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v18, v5
+; GISEL-NEXT: v_xor_b32_e32 v2, v18, v6
+; GISEL-NEXT: v_xor_b32_e32 v3, v18, v7
+; GISEL-NEXT: v_xor_b32_e32 v4, v19, v12
+; GISEL-NEXT: v_xor_b32_e32 v5, v19, v13
+; GISEL-NEXT: v_xor_b32_e32 v14, v19, v14
+; GISEL-NEXT: v_xor_b32_e32 v15, v19, v15
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v0, v18
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v1, v18, vcc
+; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], v4, v19
+; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], v5, v19, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v12, vcc, v2, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v18, vcc
+; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v19, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v19, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v14, v23
+; GISEL-NEXT: v_ffbh_u32_e32 v15, v22
+; GISEL-NEXT: v_ffbh_u32_e32 v16, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v17, v6
+; GISEL-NEXT: v_or_b32_e32 v0, v22, v4
+; GISEL-NEXT: v_or_b32_e32 v1, v23, v5
+; GISEL-NEXT: v_or_b32_e32 v2, v6, v12
+; GISEL-NEXT: v_or_b32_e32 v3, v7, v13
+; GISEL-NEXT: v_add_i32_e32 v15, vcc, 32, v15
+; GISEL-NEXT: v_ffbh_u32_e32 v26, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v27, v4
+; GISEL-NEXT: v_add_i32_e32 v17, vcc, 32, v17
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v13
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v12
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT: v_min_u32_e32 v0, v14, v15
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v27
+; GISEL-NEXT: v_min_u32_e32 v2, v16, v17
+; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v29
+; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT: v_min_u32_e32 v1, v26, v1
+; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2
+; GISEL-NEXT: v_min_u32_e32 v3, v28, v3
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v14, 0, 1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e64 v15, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v10, 0x7f, v0
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v16, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v2
+; GISEL-NEXT: v_or_b32_e32 v11, v1, v3
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e32 v15, v16, v15, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v11, v14, v15
+; GISEL-NEXT: v_and_b32_e32 v14, 1, v11
+; GISEL-NEXT: v_or_b32_e32 v10, v11, v10
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v6, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v16, 1, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v15, v7, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v12, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v13, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v16
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB0_12
+; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v1, vcc
+; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v0
+; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v2, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v3, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v14, s[4:5], 64, v30
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v30
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[6:7], v30
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], v30
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[10:11], v[6:7], v10
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v14
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
+; GISEL-NEXT: v_cndmask_b32_e32 v14, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v10, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v11, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v16, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v17, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v0, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v1, v13, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB0_11
+; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, 64, v26
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[12:13], v26
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[6:7], v26
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v22
+; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v23, vcc
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[12:13], v16
+; GISEL-NEXT: v_lshr_b64 v[12:13], v[12:13], v32
+; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v4, vcc
+; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v5, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v16
+; GISEL-NEXT: v_or_b32_e32 v3, v3, v17
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v12, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v13, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v2, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v3, v7, vcc
+; GISEL-NEXT: v_mov_b32_e32 v7, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GISEL-NEXT: .LBB0_9: ; %udiv-do-while
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[12:13], 1
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v6, 31, v13
+; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v11
+; GISEL-NEXT: v_lshl_b64 v[12:13], v[14:15], 1
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v15
+; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26
+; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v16, v6
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v34
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v14
+; GISEL-NEXT: v_or_b32_e32 v14, v0, v12
+; GISEL-NEXT: v_or_b32_e32 v15, v1, v13
+; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc
+; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v2
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v26, v28
+; GISEL-NEXT: v_or_b32_e32 v1, v27, v29
+; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v32, v16, vcc
+; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v33, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v6
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v6, 1, v0
+; GISEL-NEXT: v_and_b32_e32 v12, v0, v22
+; GISEL-NEXT: v_and_b32_e32 v13, v0, v23
+; GISEL-NEXT: v_and_b32_e32 v34, v0, v4
+; GISEL-NEXT: v_and_b32_e32 v35, v0, v5
+; GISEL-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-NEXT: v_mov_b32_e32 v1, v7
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v2, v12
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v3, v13, vcc
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v34, vcc
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB0_9
+; GISEL-NEXT: ; %bb.10: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB0_11: ; %Flow11
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
+; GISEL-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v15
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v4
+; GISEL-NEXT: v_or_b32_e32 v14, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v15, v1, v3
+; GISEL-NEXT: .LBB0_12: ; %Flow12
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_xor_b32_e32 v3, v25, v24
+; GISEL-NEXT: v_xor_b32_e32 v7, v19, v18
+; GISEL-NEXT: v_xor_b32_e32 v0, v20, v3
+; GISEL-NEXT: v_xor_b32_e32 v1, v21, v3
+; GISEL-NEXT: v_xor_b32_e32 v2, v8, v3
+; GISEL-NEXT: v_xor_b32_e32 v6, v9, v3
+; GISEL-NEXT: v_xor_b32_e32 v4, v14, v7
+; GISEL-NEXT: v_xor_b32_e32 v5, v15, v7
+; GISEL-NEXT: v_xor_b32_e32 v8, v10, v7
+; GISEL-NEXT: v_xor_b32_e32 v9, v11, v7
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v3
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v3, vcc
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v7
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v7, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v3, vcc
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v6, v3, vcc
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v8, v7, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v9, v7, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%shl = sdiv <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
}
define <2 x i128> @v_udiv_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_udiv_v2i128_vv:
+; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_or_b32_e32 v17, v9, v11
+; SDAG-NEXT: v_or_b32_e32 v16, v8, v10
+; SDAG-NEXT: v_or_b32_e32 v19, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v18, v0, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v11
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v9
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v25, v3
+; SDAG-NEXT: v_ffbh_u32_e32 v26, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v27, v1
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
+; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
+; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
+; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
+; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT: v_sub_i32_e32 v23, vcc, v16, v18
+; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v20, v17, vcc
+; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v23
+; SDAG-NEXT: v_subbrev_u32_e32 v25, vcc, 0, v28, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[23:24]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v26, vcc, 0, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v25
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[25:26]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v24, v26
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26]
+; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v18
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v3, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v0, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_6
+; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, 1, v23
+; SDAG-NEXT: v_sub_i32_e64 v16, s[4:5], 63, v23
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_addc_u32_e32 v27, vcc, 0, v24, vcc
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[0:1], v16
+; SDAG-NEXT: v_addc_u32_e32 v28, vcc, 0, v25, vcc
+; SDAG-NEXT: v_addc_u32_e32 v29, vcc, 0, v26, vcc
+; SDAG-NEXT: v_or_b32_e32 v19, v18, v28
+; SDAG-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v23
+; SDAG-NEXT: v_or_b32_e32 v20, v27, v29
+; SDAG-NEXT: v_lshl_b64 v[23:24], v[2:3], v30
+; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v30
+; SDAG-NEXT: v_lshl_b64 v[25:26], v[0:1], v30
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[19:20]
+; SDAG-NEXT: v_lshr_b64 v[19:20], v[0:1], v31
+; SDAG-NEXT: v_or_b32_e32 v20, v24, v20
+; SDAG-NEXT: v_or_b32_e32 v19, v23, v19
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v26, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_5
+; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT: v_lshr_b64 v[21:22], v[0:1], v18
+; SDAG-NEXT: v_sub_i32_e32 v31, vcc, 64, v18
+; SDAG-NEXT: v_subrev_i32_e32 v36, vcc, 64, v18
+; SDAG-NEXT: v_lshr_b64 v[32:33], v[2:3], v18
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v8
+; SDAG-NEXT: s_mov_b64 s[12:13], 0
+; SDAG-NEXT: v_mov_b32_e32 v25, 0
+; SDAG-NEXT: v_mov_b32_e32 v26, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v18
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v18
+; SDAG-NEXT: v_lshl_b64 v[34:35], v[2:3], v31
+; SDAG-NEXT: v_lshr_b64 v[36:37], v[2:3], v36
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v33, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v32, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v22, v22, v35
+; SDAG-NEXT: v_or_b32_e32 v21, v21, v34
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v37, v22, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v36, v21, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v22, v1, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v21, v0, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: .LBB1_3: ; %udiv-do-while3
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshrrev_b32_e32 v21, 31, v24
+; SDAG-NEXT: v_lshl_b64 v[23:24], v[23:24], 1
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v34, 31, v1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v35, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_or_b32_e32 v24, v26, v24
+; SDAG-NEXT: v_or_b32_e32 v23, v25, v23
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v34
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v35
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v21
+; SDAG-NEXT: v_sub_i32_e32 v21, vcc, v30, v0
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v31, v1, vcc
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v32, v2, vcc
+; SDAG-NEXT: v_subb_u32_e32 v21, vcc, v33, v3, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v21
+; SDAG-NEXT: v_and_b32_e32 v25, v21, v8
+; SDAG-NEXT: v_and_b32_e32 v26, v21, v9
+; SDAG-NEXT: v_and_b32_e32 v34, v21, v10
+; SDAG-NEXT: v_and_b32_e32 v35, v21, v11
+; SDAG-NEXT: v_and_b32_e32 v21, 1, v21
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v25
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v26, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v34, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v35, vcc
+; SDAG-NEXT: v_add_i32_e32 v18, vcc, -1, v18
+; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc
+; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc
+; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; SDAG-NEXT: v_or_b32_e32 v25, v18, v28
+; SDAG-NEXT: v_or_b32_e32 v26, v27, v29
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[25:26]
+; SDAG-NEXT: v_or_b32_e32 v17, v20, v17
+; SDAG-NEXT: s_or_b64 s[12:13], vcc, s[12:13]
+; SDAG-NEXT: v_or_b32_e32 v16, v19, v16
+; SDAG-NEXT: v_mov_b32_e32 v26, v22
+; SDAG-NEXT: v_mov_b32_e32 v25, v21
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[12:13]
+; SDAG-NEXT: s_cbranch_execnz .LBB1_3
+; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB1_5: ; %Flow14
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v8, 31, v24
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[23:24], 1
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v8
+; SDAG-NEXT: v_or_b32_e32 v16, v20, v1
+; SDAG-NEXT: v_or_b32_e32 v18, v22, v3
+; SDAG-NEXT: v_or_b32_e32 v17, v19, v0
+; SDAG-NEXT: v_or_b32_e32 v19, v21, v2
+; SDAG-NEXT: .LBB1_6: ; %Flow16
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_or_b32_e32 v1, v13, v15
+; SDAG-NEXT: v_or_b32_e32 v0, v12, v14
+; SDAG-NEXT: v_or_b32_e32 v3, v5, v7
+; SDAG-NEXT: v_or_b32_e32 v2, v4, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v8, v14
+; SDAG-NEXT: v_ffbh_u32_e32 v9, v15
+; SDAG-NEXT: v_ffbh_u32_e32 v10, v12
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v13
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v7
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v5
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; SDAG-NEXT: v_add_i32_e64 v0, s[6:7], 32, v8
+; SDAG-NEXT: v_add_i32_e64 v1, s[6:7], 32, v10
+; SDAG-NEXT: v_add_i32_e64 v2, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v3, s[6:7], 32, v22
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_min_u32_e32 v0, v0, v9
+; SDAG-NEXT: v_min_u32_e32 v1, v1, v11
+; SDAG-NEXT: v_min_u32_e32 v2, v2, v21
+; SDAG-NEXT: v_min_u32_e32 v3, v3, v23
+; SDAG-NEXT: v_add_i32_e32 v1, vcc, 64, v1
+; SDAG-NEXT: v_addc_u32_e64 v8, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_add_i32_e32 v3, vcc, 64, v3
+; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v9, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v2
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v8, v1, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, 0x7f, v0
+; SDAG-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v24, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v3, vcc, 0, v24, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v8, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v9, v1, v3
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; SDAG-NEXT: v_and_b32_e32 v8, 1, v10
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v8
+; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v7, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v6, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v4, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_12
+; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT: v_add_i32_e32 v8, vcc, 1, v0
+; SDAG-NEXT: v_sub_i32_e64 v9, s[4:5], 63, v0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_addc_u32_e32 v11, vcc, 0, v1, vcc
+; SDAG-NEXT: v_lshl_b64 v[9:10], v[4:5], v9
+; SDAG-NEXT: v_addc_u32_e32 v24, vcc, 0, v2, vcc
+; SDAG-NEXT: v_addc_u32_e32 v25, vcc, 0, v3, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v8, v24
+; SDAG-NEXT: v_sub_i32_e32 v3, vcc, 0x7f, v0
+; SDAG-NEXT: v_or_b32_e32 v2, v11, v25
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v3
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, 64, v3
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v3
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[1:2]
+; SDAG-NEXT: v_lshr_b64 v[0:1], v[4:5], v0
+; SDAG-NEXT: v_or_b32_e32 v1, v23, v1
+; SDAG-NEXT: v_or_b32_e32 v0, v22, v0
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v3
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v10, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v27, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v26, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v3
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v2, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v9, v6, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_mov_b32_e32 v10, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_11
+; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v8
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v8
+; SDAG-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8
+; SDAG-NEXT: v_lshr_b64 v[29:30], v[6:7], v8
+; SDAG-NEXT: v_add_i32_e32 v26, vcc, -1, v12
+; SDAG-NEXT: s_mov_b64 s[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: v_mov_b32_e32 v9, 0
+; SDAG-NEXT: v_mov_b32_e32 v10, 0
+; SDAG-NEXT: v_lshl_b64 v[31:32], v[6:7], v27
+; SDAG-NEXT: v_lshr_b64 v[6:7], v[6:7], v28
+; SDAG-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc
+; SDAG-NEXT: v_or_b32_e32 v21, v21, v32
+; SDAG-NEXT: v_or_b32_e32 v20, v20, v31
+; SDAG-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v8
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v7, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v6, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v30, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v29, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; SDAG-NEXT: v_cndmask_b32_e32 v5, v21, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v4, v20, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: .LBB1_9: ; %udiv-do-while
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[6:7], v[6:7], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v5
+; SDAG-NEXT: v_lshl_b64 v[4:5], v[4:5], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v30, 31, v3
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v31, 31, v1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v20
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v30
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v31
+; SDAG-NEXT: v_or_b32_e32 v3, v10, v3
+; SDAG-NEXT: v_or_b32_e32 v1, v23, v1
+; SDAG-NEXT: v_or_b32_e32 v2, v9, v2
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v26, v4
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v27, v5, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v28, v6, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v29, v7, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v23, 31, v20
+; SDAG-NEXT: v_and_b32_e32 v20, 1, v23
+; SDAG-NEXT: v_and_b32_e32 v30, v23, v15
+; SDAG-NEXT: v_and_b32_e32 v31, v23, v14
+; SDAG-NEXT: v_and_b32_e32 v32, v23, v13
+; SDAG-NEXT: v_and_b32_e32 v23, v23, v12
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v23
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v32, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v31, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v30, vcc
+; SDAG-NEXT: v_add_i32_e32 v8, vcc, -1, v8
+; SDAG-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc
+; SDAG-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc
+; SDAG-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc
+; SDAG-NEXT: v_or_b32_e32 v31, v11, v25
+; SDAG-NEXT: v_or_b32_e32 v30, v8, v24
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[30:31]
+; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v0, v22, v0
+; SDAG-NEXT: v_mov_b32_e32 v23, v21
+; SDAG-NEXT: v_mov_b32_e32 v22, v20
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_cbranch_execnz .LBB1_9
+; SDAG-NEXT: ; %bb.10: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: .LBB1_11: ; %Flow11
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 31, v1
+; SDAG-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v4
+; SDAG-NEXT: v_or_b32_e32 v8, v10, v3
+; SDAG-NEXT: v_or_b32_e32 v10, v21, v1
+; SDAG-NEXT: v_or_b32_e32 v9, v9, v2
+; SDAG-NEXT: v_or_b32_e32 v11, v20, v0
+; SDAG-NEXT: .LBB1_12: ; %Flow12
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, v19
+; SDAG-NEXT: v_mov_b32_e32 v1, v18
+; SDAG-NEXT: v_mov_b32_e32 v2, v17
+; SDAG-NEXT: v_mov_b32_e32 v3, v16
+; SDAG-NEXT: v_mov_b32_e32 v4, v11
+; SDAG-NEXT: v_mov_b32_e32 v5, v10
+; SDAG-NEXT: v_mov_b32_e32 v6, v9
+; SDAG-NEXT: v_mov_b32_e32 v7, v8
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_udiv_v2i128_vv:
+; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v16, v2
+; GISEL-NEXT: v_mov_b32_e32 v17, v3
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_or_b32_e32 v2, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v3, v9, v11
+; GISEL-NEXT: v_or_b32_e32 v18, v0, v16
+; GISEL-NEXT: v_or_b32_e32 v19, v1, v17
+; GISEL-NEXT: v_ffbh_u32_e32 v20, v9
+; GISEL-NEXT: v_ffbh_u32_e32 v21, v8
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v10
+; GISEL-NEXT: v_ffbh_u32_e32 v26, v1
+; GISEL-NEXT: v_ffbh_u32_e32 v27, v0
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v17
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v16
+; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v25, 0
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 32, v21
+; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v23
+; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27
+; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29
+; GISEL-NEXT: v_min_u32_e32 v2, v20, v2
+; GISEL-NEXT: v_min_u32_e32 v3, v22, v3
+; GISEL-NEXT: v_min_u32_e32 v18, v26, v18
+; GISEL-NEXT: v_min_u32_e32 v19, v28, v19
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v2, vcc, 64, v2
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v19, v18, vcc
+; GISEL-NEXT: v_sub_i32_e32 v20, vcc, v2, v3
+; GISEL-NEXT: v_subb_u32_e64 v21, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[20:21], v[24:25]
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v20
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v22
+; GISEL-NEXT: v_or_b32_e32 v3, v21, v23
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v26, v18
+; GISEL-NEXT: v_and_b32_e32 v18, 1, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v18, v0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v24, 1, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v19, v1, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v16, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v17, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB1_6
+; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT: v_add_i32_e32 v26, vcc, 1, v20
+; GISEL-NEXT: v_addc_u32_e64 v27, s[4:5], 0, v21, vcc
+; GISEL-NEXT: v_sub_i32_e32 v30, vcc, 0x7f, v20
+; GISEL-NEXT: v_addc_u32_e64 v28, vcc, 0, v22, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v29, vcc, 0, v23, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v30
+; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v30
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[0:1], v30
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[16:17], v30
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20
+; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
+; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v3, vcc
+; GISEL-NEXT: v_or_b32_e32 v2, v20, v18
+; GISEL-NEXT: v_or_b32_e32 v3, v21, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v2, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v21, s11
+; GISEL-NEXT: v_mov_b32_e32 v20, s10
+; GISEL-NEXT: v_mov_b32_e32 v19, s9
+; GISEL-NEXT: v_mov_b32_e32 v18, s8
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB1_5
+; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT: v_subrev_i32_e32 v32, vcc, 64, v26
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v26
+; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v26
+; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v26
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v8
+; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v9, vcc
+; GISEL-NEXT: v_lshl_b64 v[24:25], v[16:17], v24
+; GISEL-NEXT: v_lshr_b64 v[16:17], v[16:17], v32
+; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v10, vcc
+; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v11, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v20, v20, v24
+; GISEL-NEXT: v_or_b32_e32 v21, v21, v25
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v21, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v19, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v24, v20, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v25, v21, v1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_mov_b32_e32 v21, s7
+; GISEL-NEXT: v_mov_b32_e32 v20, s6
+; GISEL-NEXT: v_mov_b32_e32 v19, s5
+; GISEL-NEXT: v_mov_b32_e32 v18, s4
+; GISEL-NEXT: .LBB1_3: ; %udiv-do-while3
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshrrev_b32_e32 v34, 31, v23
+; GISEL-NEXT: v_lshl_b64 v[20:21], v[22:23], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v25
+; GISEL-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v35, 31, v3
+; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v26
+; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v27, vcc
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GISEL-NEXT: v_or_b32_e32 v22, v18, v20
+; GISEL-NEXT: v_or_b32_e32 v23, v19, v21
+; GISEL-NEXT: v_or_b32_e32 v16, v16, v0
+; GISEL-NEXT: v_or_b32_e32 v20, v24, v35
+; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v28, vcc
+; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v29, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v30, v20
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v31, v25, vcc
+; GISEL-NEXT: v_or_b32_e32 v18, v26, v28
+; GISEL-NEXT: v_or_b32_e32 v19, v27, v29
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v32, v16, vcc
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v33, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v0
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v18, v0, v8
+; GISEL-NEXT: v_and_b32_e32 v19, v0, v9
+; GISEL-NEXT: v_and_b32_e32 v21, v0, v10
+; GISEL-NEXT: v_and_b32_e32 v35, v0, v11
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v20, v18
+; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v25, v19, vcc
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v21, vcc
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v35, vcc
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v34
+; GISEL-NEXT: v_mov_b32_e32 v19, v1
+; GISEL-NEXT: v_mov_b32_e32 v18, v0
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB1_3
+; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB1_5: ; %Flow14
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[22:23], 1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[2:3], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v8, 31, v23
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v8
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v0
+; GISEL-NEXT: v_or_b32_e32 v19, v19, v1
+; GISEL-NEXT: .LBB1_6: ; %Flow16
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_or_b32_e32 v0, v12, v14
+; GISEL-NEXT: v_or_b32_e32 v1, v13, v15
+; GISEL-NEXT: v_or_b32_e32 v8, v4, v6
+; GISEL-NEXT: v_or_b32_e32 v9, v5, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v16, v13
+; GISEL-NEXT: v_ffbh_u32_e32 v17, v12
+; GISEL-NEXT: v_ffbh_u32_e32 v20, v15
+; GISEL-NEXT: v_ffbh_u32_e32 v21, v14
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v4
+; GISEL-NEXT: v_ffbh_u32_e32 v24, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v25, v6
+; GISEL-NEXT: v_mov_b32_e32 v10, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v11, 0
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 32, v17
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v21
+; GISEL-NEXT: v_add_i32_e64 v8, s[6:7], 32, v23
+; GISEL-NEXT: v_add_i32_e64 v9, s[6:7], 32, v25
+; GISEL-NEXT: v_min_u32_e32 v0, v16, v0
+; GISEL-NEXT: v_min_u32_e32 v1, v20, v1
+; GISEL-NEXT: v_min_u32_e32 v8, v22, v8
+; GISEL-NEXT: v_min_u32_e32 v9, v24, v9
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v0, vcc, 64, v0
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 64, v8
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v9, v8, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v16, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v8, 0x7f, v0
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v16
+; GISEL-NEXT: v_or_b32_e32 v9, v1, v17
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v11, v10, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e64 v8, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v9, v20, v10
+; GISEL-NEXT: v_and_b32_e32 v10, 1, v9
+; GISEL-NEXT: v_or_b32_e32 v8, v9, v8
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v10
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v4, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v5, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v6, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v7, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB1_12
+; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e64 v11, s[4:5], 0, v1, vcc
+; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v0
+; GISEL-NEXT: v_addc_u32_e64 v24, vcc, 0, v16, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v25, vcc, 0, v17, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v9, s[4:5], 64, v26
+; GISEL-NEXT: v_sub_i32_e64 v10, s[4:5], 64, v26
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[4:5], v26
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[6:7], v26
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v10
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[4:5], v9
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v20, v16
+; GISEL-NEXT: v_or_b32_e32 v1, v21, v17
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v7, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v23, s11
+; GISEL-NEXT: v_mov_b32_e32 v22, s10
+; GISEL-NEXT: v_mov_b32_e32 v21, s9
+; GISEL-NEXT: v_mov_b32_e32 v20, s8
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB1_11
+; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v8
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v8
+; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v8
+; GISEL-NEXT: v_lshr_b64 v[20:21], v[4:5], v8
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v26, vcc, -1, v12
+; GISEL-NEXT: v_addc_u32_e32 v27, vcc, -1, v13, vcc
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22
+; GISEL-NEXT: v_lshr_b64 v[6:7], v[6:7], v28
+; GISEL-NEXT: v_addc_u32_e32 v28, vcc, -1, v14, vcc
+; GISEL-NEXT: v_addc_u32_e32 v29, vcc, -1, v15, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v20, v20, v22
+; GISEL-NEXT: v_or_b32_e32 v21, v21, v23
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v20, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v21, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v16, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, 0, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v6, v6, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v7, v7, v5, vcc
+; GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GISEL-NEXT: v_mov_b32_e32 v23, s7
+; GISEL-NEXT: v_mov_b32_e32 v22, s6
+; GISEL-NEXT: v_mov_b32_e32 v21, s5
+; GISEL-NEXT: v_mov_b32_e32 v20, s4
+; GISEL-NEXT: .LBB1_9: ; %udiv-do-while
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], 1
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v4, 31, v7
+; GISEL-NEXT: v_lshrrev_b32_e32 v30, 31, v1
+; GISEL-NEXT: v_lshl_b64 v[6:7], v[9:10], 1
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[0:1], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v9, 31, v10
+; GISEL-NEXT: v_add_i32_e32 v8, vcc, -1, v8
+; GISEL-NEXT: v_addc_u32_e32 v11, vcc, -1, v11, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v16, v4
+; GISEL-NEXT: v_or_b32_e32 v22, v22, v30
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_or_b32_e32 v9, v20, v6
+; GISEL-NEXT: v_or_b32_e32 v10, v21, v7
+; GISEL-NEXT: v_addc_u32_e32 v24, vcc, -1, v24, vcc
+; GISEL-NEXT: v_addc_u32_e32 v25, vcc, -1, v25, vcc
+; GISEL-NEXT: v_sub_i32_e32 v4, vcc, v26, v22
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v27, v23, vcc
+; GISEL-NEXT: v_or_b32_e32 v6, v8, v24
+; GISEL-NEXT: v_or_b32_e32 v7, v11, v25
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v28, v16, vcc
+; GISEL-NEXT: v_subb_u32_e32 v4, vcc, v29, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v4
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v4, 1, v6
+; GISEL-NEXT: v_and_b32_e32 v7, v6, v12
+; GISEL-NEXT: v_and_b32_e32 v30, v6, v13
+; GISEL-NEXT: v_and_b32_e32 v31, v6, v14
+; GISEL-NEXT: v_and_b32_e32 v32, v6, v15
+; GISEL-NEXT: v_mov_b32_e32 v21, v5
+; GISEL-NEXT: v_mov_b32_e32 v20, v4
+; GISEL-NEXT: v_sub_i32_e32 v6, vcc, v22, v7
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v23, v30, vcc
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v16, v31, vcc
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v17, v32, vcc
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB1_9
+; GISEL-NEXT: ; %bb.10: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB1_11: ; %Flow11
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[4:5], v[9:10], 1
+; GISEL-NEXT: v_lshl_b64 v[8:9], v[0:1], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v10
+; GISEL-NEXT: v_or_b32_e32 v8, v8, v0
+; GISEL-NEXT: v_or_b32_e32 v10, v20, v4
+; GISEL-NEXT: v_or_b32_e32 v11, v21, v5
+; GISEL-NEXT: .LBB1_12: ; %Flow12
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_mov_b32_e32 v0, v18
+; GISEL-NEXT: v_mov_b32_e32 v1, v19
+; GISEL-NEXT: v_mov_b32_e32 v4, v10
+; GISEL-NEXT: v_mov_b32_e32 v5, v11
+; GISEL-NEXT: v_mov_b32_e32 v6, v8
+; GISEL-NEXT: v_mov_b32_e32 v7, v9
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%shl = udiv <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
}
define <2 x i128> @v_srem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_srem_v2i128_vv:
+; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s32 ; 4-byte Folded Spill
+; SDAG-NEXT: v_ashrrev_i32_e32 v28, 31, v3
+; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v11
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: v_mov_b32_e32 v29, v28
+; SDAG-NEXT: v_xor_b32_e32 v18, v3, v28
+; SDAG-NEXT: v_xor_b32_e32 v19, v2, v28
+; SDAG-NEXT: v_xor_b32_e32 v1, v1, v28
+; SDAG-NEXT: v_xor_b32_e32 v0, v0, v28
+; SDAG-NEXT: v_xor_b32_e32 v11, v11, v16
+; SDAG-NEXT: v_xor_b32_e32 v10, v10, v16
+; SDAG-NEXT: v_xor_b32_e32 v20, v9, v16
+; SDAG-NEXT: v_xor_b32_e32 v9, v8, v16
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v0, v28
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v1, v28, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v2
+; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v19, v28, vcc
+; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v3
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v18, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v2, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v0
+; SDAG-NEXT: v_min_u32_e32 v19, v19, v21
+; SDAG-NEXT: v_sub_i32_e32 v31, vcc, v9, v16
+; SDAG-NEXT: v_or_b32_e32 v9, v3, v1
+; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 32, v18
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v1
+; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], 64, v19
+; SDAG-NEXT: v_addc_u32_e64 v22, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v30, vcc, v20, v16, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[8:9]
+; SDAG-NEXT: v_ffbh_u32_e32 v9, v31
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v21
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v22, 0, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v8, vcc, v10, v16, vcc
+; SDAG-NEXT: v_add_i32_e64 v21, s[8:9], 32, v9
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v19, v18, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v9, vcc, v11, v16, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v31, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v16, v8
+; SDAG-NEXT: v_min_u32_e32 v19, v21, v22
+; SDAG-NEXT: v_or_b32_e32 v11, v30, v9
+; SDAG-NEXT: v_add_i32_e32 v16, vcc, 32, v16
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v9
+; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT: v_addc_u32_e64 v22, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_min_u32_e32 v10, v16, v21
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[8:9]
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v22, 0, s[6:7]
+; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v19, v10, s[6:7]
+; SDAG-NEXT: v_sub_i32_e32 v10, vcc, v10, v18
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc
+; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v10
+; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v17, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v17, vcc
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v18
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v11, v19
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e32 v20, v21, v20, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v20
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v35, v1, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v27, v3, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v33, v2, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB2_6
+; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT: v_add_i32_e32 v32, vcc, 1, v10
+; SDAG-NEXT: v_sub_i32_e64 v20, s[4:5], 63, v10
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v11, vcc
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[2:3], v20
+; SDAG-NEXT: v_addc_u32_e32 v34, vcc, 0, v18, vcc
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v19, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v32, v34
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v10
+; SDAG-NEXT: v_or_b32_e32 v19, v33, v35
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[0:1], v24
+; SDAG-NEXT: v_sub_i32_e32 v25, vcc, 64, v24
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[2:3], v24
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[2:3], v25
+; SDAG-NEXT: v_or_b32_e32 v11, v11, v19
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v18
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v24
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v21, v11, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v20, v10, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v24
+; SDAG-NEXT: v_cndmask_b32_e64 v11, v11, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v10, v10, v0, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB2_5
+; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[2:3], v32
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 64, v32
+; SDAG-NEXT: v_subrev_i32_e32 v37, vcc, 64, v32
+; SDAG-NEXT: v_lshr_b64 v[24:25], v[0:1], v32
+; SDAG-NEXT: v_add_i32_e32 v36, vcc, -1, v31
+; SDAG-NEXT: s_mov_b64 s[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[0:1], v26
+; SDAG-NEXT: v_lshr_b64 v[48:49], v[0:1], v37
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v30, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v17, v27
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v26
+; SDAG-NEXT: v_addc_u32_e32 v38, vcc, -1, v8, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v32
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v49, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v48, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v27, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v26, 0, v24, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v9, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v32
+; SDAG-NEXT: v_cndmask_b32_e32 v25, v17, v3, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v24, v16, v2, vcc
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: .LBB2_3: ; %udiv-do-while3
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v25
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v48, 31, v11
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v49, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_or_b32_e32 v26, v26, v16
+; SDAG-NEXT: v_or_b32_e32 v24, v24, v48
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v49
+; SDAG-NEXT: v_or_b32_e32 v11, v19, v11
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v36, v24
+; SDAG-NEXT: v_or_b32_e32 v10, v18, v10
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v37, v25, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v38, v26, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v39, v27, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v16
+; SDAG-NEXT: v_and_b32_e32 v48, v16, v31
+; SDAG-NEXT: v_and_b32_e32 v49, v16, v30
+; SDAG-NEXT: v_and_b32_e32 v50, v16, v8
+; SDAG-NEXT: v_and_b32_e32 v51, v16, v9
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v16
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, v24, v48
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v49, vcc
+; SDAG-NEXT: v_subb_u32_e32 v26, vcc, v26, v50, vcc
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v51, vcc
+; SDAG-NEXT: v_add_i32_e32 v32, vcc, -1, v32
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc
+; SDAG-NEXT: v_or_b32_e32 v48, v32, v34
+; SDAG-NEXT: v_or_b32_e32 v49, v33, v35
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[48:49]
+; SDAG-NEXT: v_or_b32_e32 v21, v23, v21
+; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v20, v22, v20
+; SDAG-NEXT: v_mov_b32_e32 v23, v17
+; SDAG-NEXT: v_mov_b32_e32 v22, v16
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_cbranch_execnz .LBB2_3
+; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: .LBB2_5: ; %Flow14
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[10:11], v[10:11], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v22, 31, v21
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; SDAG-NEXT: v_or_b32_e32 v10, v10, v22
+; SDAG-NEXT: v_or_b32_e32 v35, v19, v11
+; SDAG-NEXT: v_or_b32_e32 v27, v17, v21
+; SDAG-NEXT: v_or_b32_e32 v32, v18, v10
+; SDAG-NEXT: v_or_b32_e32 v33, v16, v20
+; SDAG-NEXT: .LBB2_6: ; %Flow16
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_ashrrev_i32_e32 v26, 31, v7
+; SDAG-NEXT: v_ashrrev_i32_e32 v16, 31, v15
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: s_mov_b64 s[10:11], 0x7f
+; SDAG-NEXT: v_mov_b32_e32 v34, v26
+; SDAG-NEXT: v_xor_b32_e32 v10, v7, v26
+; SDAG-NEXT: v_xor_b32_e32 v11, v6, v26
+; SDAG-NEXT: v_xor_b32_e32 v5, v5, v26
+; SDAG-NEXT: v_xor_b32_e32 v4, v4, v26
+; SDAG-NEXT: v_xor_b32_e32 v15, v15, v16
+; SDAG-NEXT: v_xor_b32_e32 v14, v14, v16
+; SDAG-NEXT: v_xor_b32_e32 v13, v13, v16
+; SDAG-NEXT: v_xor_b32_e32 v12, v12, v16
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v4, v26
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v5, v26, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v6
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v11, v26, vcc
+; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], 32, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v18, v7
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v10, v26, vcc
+; SDAG-NEXT: v_or_b32_e32 v10, v6, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v19, v4
+; SDAG-NEXT: v_min_u32_e32 v18, v11, v18
+; SDAG-NEXT: v_sub_i32_e32 v37, vcc, v12, v16
+; SDAG-NEXT: v_or_b32_e32 v11, v7, v5
+; SDAG-NEXT: v_add_i32_e64 v12, s[4:5], 32, v19
+; SDAG-NEXT: v_ffbh_u32_e32 v19, v5
+; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], 64, v18
+; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v36, vcc, v13, v16, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[10:11]
+; SDAG-NEXT: v_ffbh_u32_e32 v11, v37
+; SDAG-NEXT: v_min_u32_e32 v12, v12, v19
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v20, 0, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v10, vcc, v14, v16, vcc
+; SDAG-NEXT: v_add_i32_e64 v13, s[8:9], 32, v11
+; SDAG-NEXT: v_ffbh_u32_e32 v14, v36
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[6:7]
+; SDAG-NEXT: v_subb_u32_e32 v11, vcc, v15, v16, vcc
+; SDAG-NEXT: v_or_b32_e32 v12, v37, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v15, v10
+; SDAG-NEXT: v_min_u32_e32 v14, v13, v14
+; SDAG-NEXT: v_or_b32_e32 v13, v36, v11
+; SDAG-NEXT: v_add_i32_e32 v15, vcc, 32, v15
+; SDAG-NEXT: v_ffbh_u32_e32 v16, v11
+; SDAG-NEXT: v_add_i32_e32 v14, vcc, 64, v14
+; SDAG-NEXT: v_addc_u32_e64 v20, s[6:7], 0, 0, vcc
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[12:13]
+; SDAG-NEXT: v_min_u32_e32 v12, v15, v16
+; SDAG-NEXT: v_cmp_ne_u64_e64 s[6:7], 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, v20, 0, s[6:7]
+; SDAG-NEXT: s_or_b64 s[8:9], vcc, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, v14, v12, s[6:7]
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, v12, v18
+; SDAG-NEXT: v_subb_u32_e32 v13, vcc, v13, v19, vcc
+; SDAG-NEXT: v_xor_b32_e32 v16, 0x7f, v12
+; SDAG-NEXT: v_subbrev_u32_e32 v14, vcc, 0, v17, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[10:11], v[12:13]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v15, vcc, 0, v17, vcc
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v14
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v13, v15
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v18
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16
+; SDAG-NEXT: s_or_b64 s[4:5], s[8:9], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v5, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v4, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v7, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v6, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB2_12
+; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT: v_add_i32_e32 v38, vcc, 1, v12
+; SDAG-NEXT: v_sub_i32_e64 v18, s[4:5], 63, v12
+; SDAG-NEXT: v_mov_b32_e32 v16, 0
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, 0, v13, vcc
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[6:7], v18
+; SDAG-NEXT: v_addc_u32_e32 v48, vcc, 0, v14, vcc
+; SDAG-NEXT: v_addc_u32_e32 v49, vcc, 0, v15, vcc
+; SDAG-NEXT: v_or_b32_e32 v13, v38, v48
+; SDAG-NEXT: v_sub_i32_e32 v15, vcc, 0x7f, v12
+; SDAG-NEXT: v_or_b32_e32 v14, v39, v49
+; SDAG-NEXT: v_lshl_b64 v[20:21], v[4:5], v15
+; SDAG-NEXT: v_sub_i32_e32 v12, vcc, 64, v15
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[6:7], v15
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[13:14]
+; SDAG-NEXT: v_lshr_b64 v[12:13], v[6:7], v12
+; SDAG-NEXT: v_or_b32_e32 v13, v21, v13
+; SDAG-NEXT: v_or_b32_e32 v12, v20, v12
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v15
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v19, v13, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v18, v12, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v22, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; SDAG-NEXT: v_cndmask_b32_e64 v15, v14, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v18, v4, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB2_11
+; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[6:7], v38
+; SDAG-NEXT: v_sub_i32_e32 v24, vcc, 64, v38
+; SDAG-NEXT: v_subrev_i32_e32 v51, vcc, 64, v38
+; SDAG-NEXT: v_lshr_b64 v[22:23], v[4:5], v38
+; SDAG-NEXT: v_add_i32_e32 v50, vcc, -1, v37
+; SDAG-NEXT: s_mov_b64 s[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[4:5], v24
+; SDAG-NEXT: v_lshr_b64 v[53:54], v[4:5], v51
+; SDAG-NEXT: v_addc_u32_e32 v51, vcc, -1, v36, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v17, v25
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
+; SDAG-NEXT: v_addc_u32_e32 v52, vcc, -1, v10, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v38
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v54, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v53, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v25, 0, v23, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v24, 0, v22, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v53, vcc, -1, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v38
+; SDAG-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc
+; SDAG-NEXT: v_mov_b32_e32 v17, 0
+; SDAG-NEXT: .LBB2_9: ; %udiv-do-while
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[24:25], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v16, 31, v23
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v54, 31, v15
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v55, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_or_b32_e32 v24, v24, v16
+; SDAG-NEXT: v_or_b32_e32 v22, v22, v54
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v55
+; SDAG-NEXT: v_or_b32_e32 v15, v19, v15
+; SDAG-NEXT: v_or_b32_e32 v13, v21, v13
+; SDAG-NEXT: v_or_b32_e32 v14, v18, v14
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v50, v22
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v51, v23, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v52, v24, vcc
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v53, v25, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v21, 31, v16
+; SDAG-NEXT: v_and_b32_e32 v16, 1, v21
+; SDAG-NEXT: v_and_b32_e32 v54, v21, v11
+; SDAG-NEXT: v_and_b32_e32 v55, v21, v10
+; SDAG-NEXT: v_and_b32_e32 v40, v21, v36
+; SDAG-NEXT: v_and_b32_e32 v21, v21, v37
+; SDAG-NEXT: v_sub_i32_e32 v22, vcc, v22, v21
+; SDAG-NEXT: v_subb_u32_e32 v23, vcc, v23, v40, vcc
+; SDAG-NEXT: v_subb_u32_e32 v24, vcc, v24, v55, vcc
+; SDAG-NEXT: v_subb_u32_e32 v25, vcc, v25, v54, vcc
+; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v38
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc
+; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v48, vcc
+; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v49, vcc
+; SDAG-NEXT: v_or_b32_e32 v55, v39, v49
+; SDAG-NEXT: v_or_b32_e32 v54, v38, v48
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[54:55]
+; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v12, v20, v12
+; SDAG-NEXT: v_mov_b32_e32 v21, v17
+; SDAG-NEXT: v_mov_b32_e32 v20, v16
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_cbranch_execnz .LBB2_9
+; SDAG-NEXT: ; %bb.10: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: .LBB2_11: ; %Flow11
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v13
+; SDAG-NEXT: v_lshl_b64 v[12:13], v[12:13], 1
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v20
+; SDAG-NEXT: v_or_b32_e32 v19, v19, v15
+; SDAG-NEXT: v_or_b32_e32 v17, v17, v13
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v14
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v12
+; SDAG-NEXT: .LBB2_12: ; %Flow12
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mul_lo_u32 v14, v33, v9
+; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v33, v8, 0
+; SDAG-NEXT: v_mul_lo_u32 v24, v27, v8
+; SDAG-NEXT: v_mul_lo_u32 v25, v35, v31
+; SDAG-NEXT: v_mul_lo_u32 v35, v32, v30
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v33, 0
+; SDAG-NEXT: v_mov_b32_e32 v15, 0
+; SDAG-NEXT: v_mul_lo_u32 v38, v16, v11
+; SDAG-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v16, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v39, v17, v10
+; SDAG-NEXT: v_mul_lo_u32 v19, v19, v37
+; SDAG-NEXT: v_mul_lo_u32 v48, v18, v36
+; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v37, v16, 0
+; SDAG-NEXT: v_add_i32_e32 v13, vcc, v13, v14
+; SDAG-NEXT: v_mov_b32_e32 v14, v9
+; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v30, v33, v[14:15]
+; SDAG-NEXT: v_sub_i32_e32 v2, vcc, v2, v8
+; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v21, v38
+; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v13, v24
+; SDAG-NEXT: v_mov_b32_e32 v24, v23
+; SDAG-NEXT: v_mov_b32_e32 v23, v15
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v31, v27, v[22:23]
+; SDAG-NEXT: v_xor_b32_e32 v33, v2, v28
+; SDAG-NEXT: v_add_i32_e64 v21, s[4:5], v14, v39
+; SDAG-NEXT: v_mov_b32_e32 v14, v11
+; SDAG-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v36, v16, v[14:15]
+; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v32, v31, v[12:13]
+; SDAG-NEXT: v_mov_b32_e32 v2, v9
+; SDAG-NEXT: v_add_i32_e64 v13, s[4:5], v24, v2
+; SDAG-NEXT: v_addc_u32_e64 v14, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v2, v8
+; SDAG-NEXT: v_subb_u32_e32 v16, vcc, v3, v2, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v18, v37, v[20:21]
+; SDAG-NEXT: v_mov_b32_e32 v18, v23
+; SDAG-NEXT: v_mov_b32_e32 v23, v15
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v37, v17, v[22:23]
+; SDAG-NEXT: v_add_i32_e64 v20, s[4:5], v25, v12
+; SDAG-NEXT: v_mad_u64_u32 v[12:13], s[4:5], v30, v27, v[13:14]
+; SDAG-NEXT: v_xor_b32_e32 v16, v16, v29
+; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v19, v3
+; SDAG-NEXT: v_add_i32_e64 v14, s[4:5], v18, v9
+; SDAG-NEXT: v_addc_u32_e64 v15, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v18, v8
+; SDAG-NEXT: v_add_i32_e64 v19, s[4:5], v35, v20
+; SDAG-NEXT: v_add_i32_e64 v3, s[4:5], v48, v3
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v36, v17, v[14:15]
+; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v12, v11
+; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], v13, v19, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v0, vcc, v0, v11, vcc
+; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v2
+; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v3, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v12, vcc
+; SDAG-NEXT: v_xor_b32_e32 v2, v0, v28
+; SDAG-NEXT: v_xor_b32_e32 v3, v1, v29
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v33, v28
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v16, v29, vcc
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v28, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v29, vcc
+; SDAG-NEXT: v_sub_i32_e32 v6, vcc, v6, v10
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v18, vcc
+; SDAG-NEXT: v_xor_b32_e32 v6, v6, v26
+; SDAG-NEXT: v_subb_u32_e32 v4, vcc, v4, v8, vcc
+; SDAG-NEXT: v_xor_b32_e32 v7, v7, v34
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v9, vcc
+; SDAG-NEXT: v_xor_b32_e32 v8, v4, v26
+; SDAG-NEXT: v_xor_b32_e32 v9, v5, v34
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v6, v26
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v7, v34, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v8, v26, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v9, v34, vcc
+; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload
+; SDAG-NEXT: s_waitcnt vmcnt(0)
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_srem_v2i128_vv:
+; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v28, 31, v3
+; GISEL-NEXT: v_ashrrev_i32_e32 v20, 31, v11
+; GISEL-NEXT: v_mov_b32_e32 v18, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v19, 0
+; GISEL-NEXT: v_xor_b32_e32 v0, v0, v28
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28
+; GISEL-NEXT: v_xor_b32_e32 v3, v3, v28
+; GISEL-NEXT: v_xor_b32_e32 v8, v8, v20
+; GISEL-NEXT: v_xor_b32_e32 v9, v9, v20
+; GISEL-NEXT: v_xor_b32_e32 v10, v10, v20
+; GISEL-NEXT: v_xor_b32_e32 v11, v11, v20
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v0, v28
+; GISEL-NEXT: v_subb_u32_e32 v17, vcc, v1, v28, vcc
+; GISEL-NEXT: v_sub_i32_e64 v30, s[4:5], v8, v20
+; GISEL-NEXT: v_subb_u32_e64 v29, s[4:5], v9, v20, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v2, v28, vcc
+; GISEL-NEXT: v_subb_u32_e32 v9, vcc, v3, v28, vcc
+; GISEL-NEXT: v_subb_u32_e64 v10, vcc, v10, v20, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v11, vcc, v11, v20, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v20, v29
+; GISEL-NEXT: v_ffbh_u32_e32 v21, v30
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v17
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v16
+; GISEL-NEXT: v_or_b32_e32 v0, v30, v10
+; GISEL-NEXT: v_or_b32_e32 v1, v29, v11
+; GISEL-NEXT: v_or_b32_e32 v2, v16, v8
+; GISEL-NEXT: v_or_b32_e32 v3, v17, v9
+; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21
+; GISEL-NEXT: v_ffbh_u32_e32 v24, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v25, v10
+; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23
+; GISEL-NEXT: v_ffbh_u32_e32 v26, v9
+; GISEL-NEXT: v_ffbh_u32_e32 v27, v8
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[2:3]
+; GISEL-NEXT: v_min_u32_e32 v0, v20, v21
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25
+; GISEL-NEXT: v_min_u32_e32 v2, v22, v23
+; GISEL-NEXT: v_add_i32_e64 v3, s[6:7], 32, v27
+; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT: v_min_u32_e32 v1, v24, v1
+; GISEL-NEXT: v_add_i32_e64 v2, s[6:7], 64, v2
+; GISEL-NEXT: v_min_u32_e32 v3, v26, v3
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v2, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[18:19]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v0
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v2
+; GISEL-NEXT: v_or_b32_e32 v19, v1, v3
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v19, v20, v21
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v19
+; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e64 v31, v16, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v32, v17, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v18, v8, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v19, v9, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB2_6
+; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT: v_add_i32_e32 v31, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e64 v32, s[4:5], 0, v1, vcc
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0
+; GISEL-NEXT: v_addc_u32_e64 v33, vcc, 0, v2, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v34, vcc, 0, v3, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24
+; GISEL-NEXT: v_sub_i32_e64 v18, s[4:5], 64, v24
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[16:17], v24
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[8:9], v24
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[18:19], v[16:17], v18
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[16:17], v20
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v18, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v19, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, v1, v9, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB2_5
+; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v31
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v31
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[8:9], v31
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[16:17], v31
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v35, vcc, -1, v30
+; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v29, vcc
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[8:9], v22
+; GISEL-NEXT: v_lshr_b64 v[24:25], v[8:9], v24
+; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v10, vcc
+; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v11, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v22
+; GISEL-NEXT: v_or_b32_e32 v3, v3, v23
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v31
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v31
+; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v17, vcc
+; GISEL-NEXT: v_mov_b32_e32 v23, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GISEL-NEXT: .LBB2_3: ; %udiv-do-while3
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21
+; GISEL-NEXT: v_lshl_b64 v[48:49], v[24:25], 1
+; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25
+; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v19
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; GISEL-NEXT: v_add_i32_e32 v31, vcc, -1, v31
+; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; GISEL-NEXT: v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v26, v24
+; GISEL-NEXT: v_or_b32_e32 v3, v48, v25
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v22
+; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; GISEL-NEXT: v_addc_u32_e32 v34, vcc, -1, v34, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v35, v3
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v36, v49, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v31, v33
+; GISEL-NEXT: v_or_b32_e32 v1, v32, v34
+; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v37, v2, vcc
+; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v38, v27, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v0
+; GISEL-NEXT: v_and_b32_e32 v1, v0, v30
+; GISEL-NEXT: v_and_b32_e32 v25, v0, v29
+; GISEL-NEXT: v_and_b32_e32 v26, v0, v10
+; GISEL-NEXT: v_and_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1
+; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v49, v25, vcc
+; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc
+; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v0, vcc
+; GISEL-NEXT: v_mov_b32_e32 v0, v22
+; GISEL-NEXT: v_mov_b32_e32 v1, v23
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB2_3
+; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB2_5: ; %Flow14
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v20, 31, v21
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v20
+; GISEL-NEXT: v_or_b32_e32 v31, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v32, v1, v3
+; GISEL-NEXT: .LBB2_6: ; %Flow16
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_ashrrev_i32_e32 v33, 31, v7
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v15
+; GISEL-NEXT: v_mov_b32_e32 v2, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_xor_b32_e32 v1, v4, v33
+; GISEL-NEXT: v_xor_b32_e32 v4, v5, v33
+; GISEL-NEXT: v_xor_b32_e32 v5, v6, v33
+; GISEL-NEXT: v_xor_b32_e32 v7, v7, v33
+; GISEL-NEXT: v_xor_b32_e32 v6, v12, v0
+; GISEL-NEXT: v_xor_b32_e32 v20, v13, v0
+; GISEL-NEXT: v_xor_b32_e32 v14, v14, v0
+; GISEL-NEXT: v_xor_b32_e32 v15, v15, v0
+; GISEL-NEXT: v_sub_i32_e32 v12, vcc, v1, v33
+; GISEL-NEXT: v_subb_u32_e32 v13, vcc, v4, v33, vcc
+; GISEL-NEXT: v_sub_i32_e64 v35, s[4:5], v6, v0
+; GISEL-NEXT: v_subb_u32_e64 v34, s[4:5], v20, v0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v6, vcc, v5, v33, vcc
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v33, vcc
+; GISEL-NEXT: v_subb_u32_e64 v4, vcc, v14, v0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v5, vcc, v15, v0, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v20, v34
+; GISEL-NEXT: v_ffbh_u32_e32 v21, v35
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v13
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v12
+; GISEL-NEXT: v_or_b32_e32 v0, v35, v4
+; GISEL-NEXT: v_or_b32_e32 v1, v34, v5
+; GISEL-NEXT: v_or_b32_e32 v14, v12, v6
+; GISEL-NEXT: v_or_b32_e32 v15, v13, v7
+; GISEL-NEXT: v_add_i32_e32 v21, vcc, 32, v21
+; GISEL-NEXT: v_ffbh_u32_e32 v24, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v25, v4
+; GISEL-NEXT: v_add_i32_e32 v23, vcc, 32, v23
+; GISEL-NEXT: v_ffbh_u32_e32 v26, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v27, v6
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[14:15]
+; GISEL-NEXT: v_min_u32_e32 v0, v20, v21
+; GISEL-NEXT: v_add_i32_e64 v1, s[6:7], 32, v25
+; GISEL-NEXT: v_min_u32_e32 v14, v22, v23
+; GISEL-NEXT: v_add_i32_e64 v15, s[6:7], 32, v27
+; GISEL-NEXT: v_add_i32_e64 v0, s[6:7], 64, v0
+; GISEL-NEXT: v_min_u32_e32 v1, v24, v1
+; GISEL-NEXT: v_add_i32_e64 v14, s[6:7], 64, v14
+; GISEL-NEXT: v_min_u32_e32 v15, v26, v15
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, s[4:5]
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v15, v14, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v1
+; GISEL-NEXT: v_subb_u32_e64 v1, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v15, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v21, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, 0x7f, v0
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
+; GISEL-NEXT: v_or_b32_e32 v3, v1, v15
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT: v_cndmask_b32_e32 v21, v22, v21, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v20, v21
+; GISEL-NEXT: v_and_b32_e32 v20, 1, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v3, v2
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v20
+; GISEL-NEXT: v_cndmask_b32_e64 v20, v12, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v2
+; GISEL-NEXT: v_cndmask_b32_e64 v21, v13, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v2, v6, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v7, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB2_12
+; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT: v_add_i32_e32 v36, vcc, 1, v0
+; GISEL-NEXT: v_addc_u32_e64 v37, s[4:5], 0, v1, vcc
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 0x7f, v0
+; GISEL-NEXT: v_addc_u32_e64 v38, vcc, 0, v14, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v39, vcc, 0, v15, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v20, s[4:5], 64, v24
+; GISEL-NEXT: v_sub_i32_e64 v14, s[4:5], 64, v24
+; GISEL-NEXT: v_lshl_b64 v[0:1], v[12:13], v24
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[6:7], v24
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[14:15], v[12:13], v14
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[12:13], v20
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v20, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, 0, v1, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v14, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v15, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v22, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v23, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v24
+; GISEL-NEXT: v_cndmask_b32_e32 v14, v0, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v15, v1, v7, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, s8
+; GISEL-NEXT: v_mov_b32_e32 v1, s9
+; GISEL-NEXT: v_mov_b32_e32 v2, s10
+; GISEL-NEXT: v_mov_b32_e32 v3, s11
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB2_11
+; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT: v_subrev_i32_e32 v24, vcc, 64, v36
+; GISEL-NEXT: v_sub_i32_e32 v22, vcc, 64, v36
+; GISEL-NEXT: v_lshr_b64 v[0:1], v[6:7], v36
+; GISEL-NEXT: v_lshr_b64 v[2:3], v[12:13], v36
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v48, vcc, -1, v35
+; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v34, vcc
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[6:7], v22
+; GISEL-NEXT: v_lshr_b64 v[24:25], v[6:7], v24
+; GISEL-NEXT: v_addc_u32_e32 v50, vcc, -1, v4, vcc
+; GISEL-NEXT: v_addc_u32_e32 v51, vcc, -1, v5, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v22
+; GISEL-NEXT: v_or_b32_e32 v3, v3, v23
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v36
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v24, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v25, v3, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v26, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v27, 0, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v36
+; GISEL-NEXT: v_cndmask_b32_e32 v24, v2, v12, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v25, v3, v13, vcc
+; GISEL-NEXT: v_mov_b32_e32 v23, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GISEL-NEXT: .LBB2_9: ; %udiv-do-while
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[20:21], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v21
+; GISEL-NEXT: v_lshl_b64 v[52:53], v[24:25], 1
+; GISEL-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v25
+; GISEL-NEXT: v_lshrrev_b32_e32 v25, 31, v15
+; GISEL-NEXT: v_lshl_b64 v[14:15], v[14:15], 1
+; GISEL-NEXT: v_add_i32_e32 v36, vcc, -1, v36
+; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc
+; GISEL-NEXT: v_or_b32_e32 v20, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v21, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v2, v26, v24
+; GISEL-NEXT: v_or_b32_e32 v3, v52, v25
+; GISEL-NEXT: v_or_b32_e32 v14, v14, v22
+; GISEL-NEXT: v_addc_u32_e32 v38, vcc, -1, v38, vcc
+; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v39, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v48, v3
+; GISEL-NEXT: v_subb_u32_e32 v0, vcc, v49, v53, vcc
+; GISEL-NEXT: v_or_b32_e32 v0, v36, v38
+; GISEL-NEXT: v_or_b32_e32 v1, v37, v39
+; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v50, v2, vcc
+; GISEL-NEXT: v_subb_u32_e32 v22, vcc, v51, v27, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_ashrrev_i32_e32 v0, 31, v22
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v0
+; GISEL-NEXT: v_and_b32_e32 v1, v0, v35
+; GISEL-NEXT: v_and_b32_e32 v25, v0, v34
+; GISEL-NEXT: v_and_b32_e32 v26, v0, v4
+; GISEL-NEXT: v_and_b32_e32 v52, v0, v5
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, v3, v1
+; GISEL-NEXT: v_subb_u32_e32 v25, vcc, v53, v25, vcc
+; GISEL-NEXT: v_mov_b32_e32 v0, v22
+; GISEL-NEXT: v_mov_b32_e32 v1, v23
+; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v2, v26, vcc
+; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB2_9
+; GISEL-NEXT: ; %bb.10: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB2_11: ; %Flow11
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[20:21], 1
+; GISEL-NEXT: v_lshl_b64 v[2:3], v[14:15], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v14, 31, v21
+; GISEL-NEXT: v_or_b32_e32 v2, v2, v14
+; GISEL-NEXT: v_or_b32_e32 v20, v0, v22
+; GISEL-NEXT: v_or_b32_e32 v21, v1, v23
+; GISEL-NEXT: .LBB2_12: ; %Flow12
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v30, v31, 0
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v30, v18, 0
+; GISEL-NEXT: v_mul_lo_u32 v24, v30, v19
+; GISEL-NEXT: v_mul_lo_u32 v25, v29, v18
+; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v35, v20, 0
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v2, 0
+; GISEL-NEXT: v_mul_lo_u32 v26, v35, v3
+; GISEL-NEXT: v_mul_lo_u32 v27, v34, v2
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v29, v32, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v34, v21, v[22:23]
+; GISEL-NEXT: v_mov_b32_e32 v22, v19
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v31, v[2:3]
+; GISEL-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v4, v20, v[14:15]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v30, v32, v[1:2]
+; GISEL-NEXT: v_mov_b32_e32 v23, v14
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v35, v21, v[22:23]
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v29, v31, v[1:2]
+; GISEL-NEXT: v_addc_u32_e64 v3, s[6:7], v3, v24, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[6:7], v34, v20, v[22:23]
+; GISEL-NEXT: v_addc_u32_e64 v14, s[6:7], v15, v26, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v3, vcc, v3, v25, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v16, v0
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v17, v1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v15, v0, v28
+; GISEL-NEXT: v_addc_u32_e64 v0, s[4:5], v14, v27, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v18
+; GISEL-NEXT: v_subb_u32_e64 v14, s[4:5], v13, v22, s[4:5]
+; GISEL-NEXT: v_xor_b32_e32 v16, v12, v33
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v10, v32, v[3:4]
+; GISEL-NEXT: v_xor_b32_e32 v1, v1, v28
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v4, v21, v[0:1]
+; GISEL-NEXT: v_xor_b32_e32 v14, v14, v33
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v31, v[12:13]
+; GISEL-NEXT: v_sub_i32_e64 v0, s[6:7], v15, v28
+; GISEL-NEXT: v_subb_u32_e64 v1, s[6:7], v1, v28, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[8:9], v5, v20, v[3:4]
+; GISEL-NEXT: v_sub_i32_e64 v4, s[8:9], v16, v33
+; GISEL-NEXT: v_subb_u32_e64 v5, s[8:9], v14, v33, s[8:9]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v8, v2, vcc
+; GISEL-NEXT: v_subb_u32_e32 v8, vcc, v9, v10, vcc
+; GISEL-NEXT: v_xor_b32_e32 v2, v2, v28
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v23, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v3, vcc
+; GISEL-NEXT: v_xor_b32_e32 v6, v6, v33
+; GISEL-NEXT: v_xor_b32_e32 v7, v8, v28
+; GISEL-NEXT: v_xor_b32_e32 v8, v3, v33
+; GISEL-NEXT: v_subb_u32_e64 v2, vcc, v2, v28, s[6:7]
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v7, v28, vcc
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v33, s[8:9]
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v8, v33, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%shl = srem <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
}
define <2 x i128> @v_urem_v2i128_vv(<2 x i128> %lhs, <2 x i128> %rhs) {
+; SDAG-LABEL: v_urem_v2i128_vv:
+; SDAG: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_or_b32_e32 v17, v9, v11
+; SDAG-NEXT: v_or_b32_e32 v16, v8, v10
+; SDAG-NEXT: v_or_b32_e32 v19, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v18, v0, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v10
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v11
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v8
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v9
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v25, v3
+; SDAG-NEXT: v_ffbh_u32_e32 v26, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v27, v1
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
+; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
+; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
+; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
+; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[10:11]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc
+; SDAG-NEXT: v_xor_b32_e32 v18, 0x7f, v16
+; SDAG-NEXT: v_subbrev_u32_e32 v20, vcc, 0, v28, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v21, vcc, 0, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v20
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v19, v17, v21
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_and_b32_e32 v18, 1, v22
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v18
+; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v33, v3, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v31, v2, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v30, v1, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v32, v0, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_6
+; SDAG-NEXT: ; %bb.1: ; %udiv-bb15
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, 1, v16
+; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16
+; SDAG-NEXT: v_mov_b32_e32 v18, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, 0, v17, vcc
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[0:1], v22
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, 0, v20, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, 0, v21, vcc
+; SDAG-NEXT: v_or_b32_e32 v20, v30, v32
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
+; SDAG-NEXT: v_or_b32_e32 v21, v31, v33
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[2:3], v26
+; SDAG-NEXT: v_sub_i32_e32 v27, vcc, 64, v26
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[0:1], v26
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[0:1], v27
+; SDAG-NEXT: v_or_b32_e32 v17, v17, v21
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v20
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v23, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v22, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, v25, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, v24, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v26
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v17, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, v16, v2, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_5
+; SDAG-NEXT: ; %bb.2: ; %udiv-preheader4
+; SDAG-NEXT: v_lshr_b64 v[18:19], v[0:1], v30
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v30
+; SDAG-NEXT: v_subrev_i32_e32 v35, vcc, 64, v30
+; SDAG-NEXT: v_lshr_b64 v[26:27], v[2:3], v30
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v8
+; SDAG-NEXT: s_mov_b64 s[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
+; SDAG-NEXT: v_mov_b32_e32 v25, 0
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_lshl_b64 v[28:29], v[2:3], v28
+; SDAG-NEXT: v_lshr_b64 v[37:38], v[2:3], v35
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc
+; SDAG-NEXT: v_or_b32_e32 v19, v19, v29
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v28
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v30
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v38, v19, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v37, v18, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; SDAG-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: .LBB3_3: ; %udiv-do-while3
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v18, 31, v27
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v38, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v39, 31, v23
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT: v_or_b32_e32 v28, v28, v18
+; SDAG-NEXT: v_or_b32_e32 v26, v26, v38
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v39
+; SDAG-NEXT: v_or_b32_e32 v17, v21, v17
+; SDAG-NEXT: v_sub_i32_e32 v18, vcc, v34, v26
+; SDAG-NEXT: v_or_b32_e32 v16, v20, v16
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v35, v27, vcc
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v36, v28, vcc
+; SDAG-NEXT: v_subb_u32_e32 v18, vcc, v37, v29, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v38, 31, v18
+; SDAG-NEXT: v_and_b32_e32 v39, v38, v8
+; SDAG-NEXT: v_and_b32_e32 v48, v38, v9
+; SDAG-NEXT: v_and_b32_e32 v49, v38, v10
+; SDAG-NEXT: v_and_b32_e32 v18, 1, v38
+; SDAG-NEXT: v_and_b32_e32 v38, v38, v11
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v39
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v48, vcc
+; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v49, vcc
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v38, vcc
+; SDAG-NEXT: v_add_i32_e32 v30, vcc, -1, v30
+; SDAG-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; SDAG-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; SDAG-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; SDAG-NEXT: v_or_b32_e32 v38, v30, v32
+; SDAG-NEXT: v_or_b32_e32 v39, v31, v33
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[38:39]
+; SDAG-NEXT: v_or_b32_e32 v23, v25, v23
+; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v22, v24, v22
+; SDAG-NEXT: v_mov_b32_e32 v25, v19
+; SDAG-NEXT: v_mov_b32_e32 v24, v18
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_cbranch_execnz .LBB3_3
+; SDAG-NEXT: ; %bb.4: ; %Flow13
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: .LBB3_5: ; %Flow14
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v23
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
+; SDAG-NEXT: v_or_b32_e32 v16, v16, v24
+; SDAG-NEXT: v_or_b32_e32 v33, v21, v17
+; SDAG-NEXT: v_or_b32_e32 v30, v19, v23
+; SDAG-NEXT: v_or_b32_e32 v31, v20, v16
+; SDAG-NEXT: v_or_b32_e32 v32, v18, v22
+; SDAG-NEXT: .LBB3_6: ; %Flow16
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_or_b32_e32 v17, v13, v15
+; SDAG-NEXT: v_or_b32_e32 v16, v12, v14
+; SDAG-NEXT: v_or_b32_e32 v19, v5, v7
+; SDAG-NEXT: v_or_b32_e32 v18, v4, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v20, v14
+; SDAG-NEXT: v_ffbh_u32_e32 v21, v15
+; SDAG-NEXT: v_ffbh_u32_e32 v22, v12
+; SDAG-NEXT: v_ffbh_u32_e32 v23, v13
+; SDAG-NEXT: v_ffbh_u32_e32 v24, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v25, v7
+; SDAG-NEXT: v_ffbh_u32_e32 v26, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v27, v5
+; SDAG-NEXT: v_mov_b32_e32 v28, 0
+; SDAG-NEXT: s_mov_b64 s[8:9], 0x7f
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; SDAG-NEXT: v_add_i32_e64 v16, s[6:7], 32, v20
+; SDAG-NEXT: v_add_i32_e64 v17, s[6:7], 32, v22
+; SDAG-NEXT: v_add_i32_e64 v18, s[6:7], 32, v24
+; SDAG-NEXT: v_add_i32_e64 v19, s[6:7], 32, v26
+; SDAG-NEXT: s_or_b64 s[6:7], vcc, s[4:5]
+; SDAG-NEXT: v_min_u32_e32 v16, v16, v21
+; SDAG-NEXT: v_min_u32_e32 v17, v17, v23
+; SDAG-NEXT: v_min_u32_e32 v18, v18, v25
+; SDAG-NEXT: v_min_u32_e32 v19, v19, v27
+; SDAG-NEXT: v_add_i32_e32 v17, vcc, 64, v17
+; SDAG-NEXT: v_addc_u32_e64 v20, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_add_i32_e32 v19, vcc, 64, v19
+; SDAG-NEXT: v_addc_u32_e64 v21, s[4:5], 0, 0, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[14:15]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v20, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, v21, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v18, v19, v18, vcc
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, v16, v18
+; SDAG-NEXT: v_subb_u32_e32 v17, vcc, v20, v17, vcc
+; SDAG-NEXT: v_xor_b32_e32 v20, 0x7f, v16
+; SDAG-NEXT: v_subbrev_u32_e32 v18, vcc, 0, v28, vcc
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[8:9], v[16:17]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; SDAG-NEXT: v_subbrev_u32_e32 v19, vcc, 0, v28, vcc
+; SDAG-NEXT: v_or_b32_e32 v20, v20, v18
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v21, v17, v19
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; SDAG-NEXT: v_cndmask_b32_e32 v22, v23, v22, vcc
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[20:21]
+; SDAG-NEXT: v_and_b32_e32 v20, 1, v22
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v20
+; SDAG-NEXT: s_or_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v23, v7, 0, s[4:5]
+; SDAG-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v6, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v5, 0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v4, 0, s[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], vcc
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_12
+; SDAG-NEXT: ; %bb.7: ; %udiv-bb1
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, 1, v16
+; SDAG-NEXT: v_sub_i32_e64 v22, s[4:5], 63, v16
+; SDAG-NEXT: v_mov_b32_e32 v20, 0
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, 0, v17, vcc
+; SDAG-NEXT: v_lshl_b64 v[22:23], v[4:5], v22
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, 0, v18, vcc
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, 0, v19, vcc
+; SDAG-NEXT: v_or_b32_e32 v17, v34, v36
+; SDAG-NEXT: v_sub_i32_e32 v19, vcc, 0x7f, v16
+; SDAG-NEXT: v_or_b32_e32 v18, v35, v37
+; SDAG-NEXT: v_lshl_b64 v[24:25], v[6:7], v19
+; SDAG-NEXT: v_sub_i32_e32 v16, vcc, 64, v19
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[4:5], v19
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[17:18]
+; SDAG-NEXT: v_lshr_b64 v[16:17], v[4:5], v16
+; SDAG-NEXT: v_or_b32_e32 v17, v25, v17
+; SDAG-NEXT: v_or_b32_e32 v16, v24, v16
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v19
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v23, v17, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v22, v22, v16, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v17, 0, v27, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v16, 0, v26, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v19
+; SDAG-NEXT: v_cndmask_b32_e64 v19, v18, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v18, v22, v6, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[8:9], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_11
+; SDAG-NEXT: ; %bb.8: ; %udiv-preheader
+; SDAG-NEXT: v_lshr_b64 v[20:21], v[4:5], v34
+; SDAG-NEXT: v_sub_i32_e32 v28, vcc, 64, v34
+; SDAG-NEXT: v_subrev_i32_e32 v39, vcc, 64, v34
+; SDAG-NEXT: v_lshr_b64 v[26:27], v[6:7], v34
+; SDAG-NEXT: v_add_i32_e32 v38, vcc, -1, v12
+; SDAG-NEXT: s_mov_b64 s[10:11], 0
+; SDAG-NEXT: v_mov_b32_e32 v24, 0
+; SDAG-NEXT: v_mov_b32_e32 v25, 0
+; SDAG-NEXT: v_mov_b32_e32 v22, 0
+; SDAG-NEXT: v_mov_b32_e32 v23, 0
+; SDAG-NEXT: v_lshl_b64 v[28:29], v[6:7], v28
+; SDAG-NEXT: v_lshr_b64 v[49:50], v[6:7], v39
+; SDAG-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc
+; SDAG-NEXT: v_or_b32_e32 v21, v21, v29
+; SDAG-NEXT: v_or_b32_e32 v20, v20, v28
+; SDAG-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v34
+; SDAG-NEXT: v_cndmask_b32_e64 v21, v50, v21, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v20, v49, v20, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v29, 0, v27, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v28, 0, v26, s[4:5]
+; SDAG-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
+; SDAG-NEXT: v_cndmask_b32_e32 v27, v21, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v26, v20, v4, vcc
+; SDAG-NEXT: v_mov_b32_e32 v21, 0
+; SDAG-NEXT: .LBB3_9: ; %udiv-do-while
+; SDAG-NEXT: ; =>This Inner Loop Header: Depth=1
+; SDAG-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v20, 31, v27
+; SDAG-NEXT: v_lshl_b64 v[26:27], v[26:27], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v50, 31, v19
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v51, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_or_b32_e32 v28, v28, v20
+; SDAG-NEXT: v_or_b32_e32 v26, v26, v50
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v51
+; SDAG-NEXT: v_or_b32_e32 v19, v23, v19
+; SDAG-NEXT: v_or_b32_e32 v17, v25, v17
+; SDAG-NEXT: v_or_b32_e32 v18, v22, v18
+; SDAG-NEXT: v_sub_i32_e32 v20, vcc, v38, v26
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v39, v27, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v48, v28, vcc
+; SDAG-NEXT: v_subb_u32_e32 v20, vcc, v49, v29, vcc
+; SDAG-NEXT: v_ashrrev_i32_e32 v25, 31, v20
+; SDAG-NEXT: v_and_b32_e32 v20, 1, v25
+; SDAG-NEXT: v_and_b32_e32 v50, v25, v15
+; SDAG-NEXT: v_and_b32_e32 v51, v25, v14
+; SDAG-NEXT: v_and_b32_e32 v52, v25, v13
+; SDAG-NEXT: v_and_b32_e32 v25, v25, v12
+; SDAG-NEXT: v_sub_i32_e32 v26, vcc, v26, v25
+; SDAG-NEXT: v_subb_u32_e32 v27, vcc, v27, v52, vcc
+; SDAG-NEXT: v_subb_u32_e32 v28, vcc, v28, v51, vcc
+; SDAG-NEXT: v_subb_u32_e32 v29, vcc, v29, v50, vcc
+; SDAG-NEXT: v_add_i32_e32 v34, vcc, -1, v34
+; SDAG-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc
+; SDAG-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc
+; SDAG-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc
+; SDAG-NEXT: v_or_b32_e32 v51, v35, v37
+; SDAG-NEXT: v_or_b32_e32 v50, v34, v36
+; SDAG-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[50:51]
+; SDAG-NEXT: s_or_b64 s[10:11], vcc, s[10:11]
+; SDAG-NEXT: v_or_b32_e32 v16, v24, v16
+; SDAG-NEXT: v_mov_b32_e32 v25, v21
+; SDAG-NEXT: v_mov_b32_e32 v24, v20
+; SDAG-NEXT: s_andn2_b64 exec, exec, s[10:11]
+; SDAG-NEXT: s_cbranch_execnz .LBB3_9
+; SDAG-NEXT: ; %bb.10: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[10:11]
+; SDAG-NEXT: .LBB3_11: ; %Flow11
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_lshl_b64 v[18:19], v[18:19], 1
+; SDAG-NEXT: v_lshrrev_b32_e32 v24, 31, v17
+; SDAG-NEXT: v_lshl_b64 v[16:17], v[16:17], 1
+; SDAG-NEXT: v_or_b32_e32 v18, v18, v24
+; SDAG-NEXT: v_or_b32_e32 v23, v23, v19
+; SDAG-NEXT: v_or_b32_e32 v21, v21, v17
+; SDAG-NEXT: v_or_b32_e32 v22, v22, v18
+; SDAG-NEXT: v_or_b32_e32 v20, v20, v16
+; SDAG-NEXT: .LBB3_12: ; %Flow12
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mul_lo_u32 v18, v32, v11
+; SDAG-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v32, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v28, v30, v10
+; SDAG-NEXT: v_mul_lo_u32 v29, v33, v8
+; SDAG-NEXT: v_mul_lo_u32 v33, v31, v9
+; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v32, 0
+; SDAG-NEXT: v_mov_b32_e32 v19, 0
+; SDAG-NEXT: v_mul_lo_u32 v34, v20, v15
+; SDAG-NEXT: v_mad_u64_u32 v[24:25], s[4:5], v20, v14, 0
+; SDAG-NEXT: v_mul_lo_u32 v35, v21, v14
+; SDAG-NEXT: v_mul_lo_u32 v23, v23, v12
+; SDAG-NEXT: v_mul_lo_u32 v36, v22, v13
+; SDAG-NEXT: v_mad_u64_u32 v[14:15], s[4:5], v12, v20, 0
+; SDAG-NEXT: v_add_i32_e32 v17, vcc, v17, v18
+; SDAG-NEXT: v_mov_b32_e32 v18, v11
+; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v9, v32, v[18:19]
+; SDAG-NEXT: v_sub_i32_e32 v0, vcc, v0, v10
+; SDAG-NEXT: v_add_i32_e64 v18, s[4:5], v25, v34
+; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v17, v28
+; SDAG-NEXT: v_mov_b32_e32 v28, v27
+; SDAG-NEXT: v_mov_b32_e32 v27, v19
+; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v8, v30, v[26:27]
+; SDAG-NEXT: v_add_i32_e64 v25, s[4:5], v18, v35
+; SDAG-NEXT: v_mov_b32_e32 v18, v15
+; SDAG-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v13, v20, v[18:19]
+; SDAG-NEXT: v_mad_u64_u32 v[15:16], s[4:5], v31, v8, v[16:17]
+; SDAG-NEXT: v_mov_b32_e32 v8, v11
+; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v28, v8
+; SDAG-NEXT: v_addc_u32_e64 v18, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v8, v10
+; SDAG-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; SDAG-NEXT: v_mad_u64_u32 v[10:11], s[4:5], v22, v12, v[24:25]
+; SDAG-NEXT: v_mov_b32_e32 v22, v27
+; SDAG-NEXT: v_mov_b32_e32 v27, v19
+; SDAG-NEXT: v_mad_u64_u32 v[19:20], s[4:5], v12, v21, v[26:27]
+; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v29, v16
+; SDAG-NEXT: v_mad_u64_u32 v[8:9], s[4:5], v9, v30, v[17:18]
+; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v23, v11
+; SDAG-NEXT: v_mov_b32_e32 v11, v20
+; SDAG-NEXT: v_add_i32_e64 v11, s[4:5], v22, v11
+; SDAG-NEXT: v_addc_u32_e64 v12, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_add_i32_e64 v16, s[4:5], v33, v16
+; SDAG-NEXT: v_add_i32_e64 v17, s[4:5], v36, v17
+; SDAG-NEXT: v_mad_u64_u32 v[11:12], s[4:5], v13, v21, v[11:12]
+; SDAG-NEXT: v_add_i32_e64 v8, s[4:5], v8, v15
+; SDAG-NEXT: v_addc_u32_e64 v9, s[4:5], v9, v16, s[4:5]
+; SDAG-NEXT: v_subb_u32_e32 v2, vcc, v2, v8, vcc
+; SDAG-NEXT: v_subb_u32_e32 v3, vcc, v3, v9, vcc
+; SDAG-NEXT: v_add_i32_e32 v8, vcc, v11, v10
+; SDAG-NEXT: v_addc_u32_e32 v9, vcc, v12, v17, vcc
+; SDAG-NEXT: v_mov_b32_e32 v10, v19
+; SDAG-NEXT: v_sub_i32_e32 v4, vcc, v4, v14
+; SDAG-NEXT: v_subb_u32_e32 v5, vcc, v5, v10, vcc
+; SDAG-NEXT: v_subb_u32_e32 v6, vcc, v6, v8, vcc
+; SDAG-NEXT: v_subb_u32_e32 v7, vcc, v7, v9, vcc
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: v_urem_v2i128_vv:
+; GISEL: ; %bb.0: ; %_udiv-special-cases_udiv-special-cases
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_or_b32_e32 v16, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v17, v9, v11
+; GISEL-NEXT: v_or_b32_e32 v18, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v19, v1, v3
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v9
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v8
+; GISEL-NEXT: v_ffbh_u32_e32 v24, v11
+; GISEL-NEXT: v_ffbh_u32_e32 v25, v10
+; GISEL-NEXT: v_ffbh_u32_e32 v26, v1
+; GISEL-NEXT: v_ffbh_u32_e32 v27, v0
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v3
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v2
+; GISEL-NEXT: v_mov_b32_e32 v20, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v21, 0
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23
+; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v25
+; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v27
+; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v29
+; GISEL-NEXT: v_min_u32_e32 v16, v22, v16
+; GISEL-NEXT: v_min_u32_e32 v17, v24, v17
+; GISEL-NEXT: v_min_u32_e32 v18, v26, v18
+; GISEL-NEXT: v_min_u32_e32 v19, v28, v19
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v22, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[10:11]
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v18, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v19, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[20:21]
+; GISEL-NEXT: v_cndmask_b32_e64 v23, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v20, 0x7f, v16
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v20, v20, v18
+; GISEL-NEXT: v_or_b32_e32 v21, v17, v19
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_cndmask_b32_e32 v23, v24, v23, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[20:21]
+; GISEL-NEXT: v_cndmask_b32_e64 v20, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v21, v22, v23
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v21
+; GISEL-NEXT: v_or_b32_e32 v20, v21, v20
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT: v_cndmask_b32_e64 v32, v0, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v22, 1, v20
+; GISEL-NEXT: v_cndmask_b32_e64 v33, v1, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v20, v2, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v21, v3, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v22
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB3_6
+; GISEL-NEXT: ; %bb.1: ; %udiv-bb15
+; GISEL-NEXT: v_add_i32_e32 v30, vcc, 1, v16
+; GISEL-NEXT: v_addc_u32_e64 v31, s[4:5], 0, v17, vcc
+; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 0x7f, v16
+; GISEL-NEXT: v_addc_u32_e64 v32, vcc, 0, v18, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v33, vcc, 0, v19, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v22, s[4:5], 64, v26
+; GISEL-NEXT: v_sub_i32_e64 v20, s[4:5], 64, v26
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[0:1], v26
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[2:3], v26
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[20:21], v[0:1], v20
+; GISEL-NEXT: v_lshl_b64 v[24:25], v[0:1], v22
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v22, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, 0, v17, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v20, v18
+; GISEL-NEXT: v_or_b32_e32 v17, v21, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v24, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, v25, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v26
+; GISEL-NEXT: v_cndmask_b32_e32 v20, v16, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v21, v17, v3, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v19, s11
+; GISEL-NEXT: v_mov_b32_e32 v18, s10
+; GISEL-NEXT: v_mov_b32_e32 v17, s9
+; GISEL-NEXT: v_mov_b32_e32 v16, s8
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB3_5
+; GISEL-NEXT: ; %bb.2: ; %udiv-preheader4
+; GISEL-NEXT: v_subrev_i32_e32 v26, vcc, 64, v30
+; GISEL-NEXT: v_sub_i32_e32 v24, vcc, 64, v30
+; GISEL-NEXT: v_lshr_b64 v[16:17], v[2:3], v30
+; GISEL-NEXT: v_lshr_b64 v[18:19], v[0:1], v30
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v8
+; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v9, vcc
+; GISEL-NEXT: v_lshl_b64 v[24:25], v[2:3], v24
+; GISEL-NEXT: v_lshr_b64 v[26:27], v[2:3], v26
+; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v10, vcc
+; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v11, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v24
+; GISEL-NEXT: v_or_b32_e32 v19, v19, v25
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v30
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v26, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, v27, v19, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v28, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v29, 0, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v30
+; GISEL-NEXT: v_cndmask_b32_e32 v26, v18, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v27, v19, v1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v25, 0
+; GISEL-NEXT: v_mov_b32_e32 v19, s7
+; GISEL-NEXT: v_mov_b32_e32 v18, s6
+; GISEL-NEXT: v_mov_b32_e32 v17, s5
+; GISEL-NEXT: v_mov_b32_e32 v16, s4
+; GISEL-NEXT: .LBB3_3: ; %udiv-do-while3
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v24, 31, v23
+; GISEL-NEXT: v_lshl_b64 v[38:39], v[26:27], 1
+; GISEL-NEXT: v_lshl_b64 v[28:29], v[28:29], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v27
+; GISEL-NEXT: v_lshrrev_b32_e32 v27, 31, v21
+; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; GISEL-NEXT: v_add_i32_e32 v30, vcc, -1, v30
+; GISEL-NEXT: v_addc_u32_e32 v31, vcc, -1, v31, vcc
+; GISEL-NEXT: v_or_b32_e32 v22, v16, v18
+; GISEL-NEXT: v_or_b32_e32 v23, v17, v19
+; GISEL-NEXT: v_or_b32_e32 v18, v28, v26
+; GISEL-NEXT: v_or_b32_e32 v19, v38, v27
+; GISEL-NEXT: v_or_b32_e32 v20, v20, v24
+; GISEL-NEXT: v_addc_u32_e32 v32, vcc, -1, v32, vcc
+; GISEL-NEXT: v_addc_u32_e32 v33, vcc, -1, v33, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v34, v19
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v35, v39, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v30, v32
+; GISEL-NEXT: v_or_b32_e32 v17, v31, v33
+; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v36, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v24, vcc, v37, v29, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v24
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v24, 1, v16
+; GISEL-NEXT: v_and_b32_e32 v17, v16, v8
+; GISEL-NEXT: v_and_b32_e32 v27, v16, v9
+; GISEL-NEXT: v_and_b32_e32 v28, v16, v10
+; GISEL-NEXT: v_and_b32_e32 v16, v16, v11
+; GISEL-NEXT: v_sub_i32_e32 v26, vcc, v19, v17
+; GISEL-NEXT: v_subb_u32_e32 v27, vcc, v39, v27, vcc
+; GISEL-NEXT: v_subb_u32_e32 v28, vcc, v18, v28, vcc
+; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v29, v16, vcc
+; GISEL-NEXT: v_mov_b32_e32 v16, v24
+; GISEL-NEXT: v_mov_b32_e32 v17, v25
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB3_3
+; GISEL-NEXT: ; %bb.4: ; %Flow13
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB3_5: ; %Flow14
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
+; GISEL-NEXT: v_lshl_b64 v[20:21], v[20:21], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v23
+; GISEL-NEXT: v_or_b32_e32 v20, v20, v22
+; GISEL-NEXT: v_or_b32_e32 v32, v16, v18
+; GISEL-NEXT: v_or_b32_e32 v33, v17, v19
+; GISEL-NEXT: .LBB3_6: ; %Flow16
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_mov_b64 s[8:9], 0
+; GISEL-NEXT: v_or_b32_e32 v16, v12, v14
+; GISEL-NEXT: v_or_b32_e32 v17, v13, v15
+; GISEL-NEXT: v_or_b32_e32 v18, v4, v6
+; GISEL-NEXT: v_or_b32_e32 v19, v5, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v22, v13
+; GISEL-NEXT: v_ffbh_u32_e32 v23, v12
+; GISEL-NEXT: v_ffbh_u32_e32 v26, v15
+; GISEL-NEXT: v_ffbh_u32_e32 v27, v14
+; GISEL-NEXT: v_ffbh_u32_e32 v28, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v29, v4
+; GISEL-NEXT: v_ffbh_u32_e32 v30, v7
+; GISEL-NEXT: v_ffbh_u32_e32 v31, v6
+; GISEL-NEXT: v_mov_b32_e32 v24, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v25, 0
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_cmp_eq_u64_e64 s[4:5], 0, v[18:19]
+; GISEL-NEXT: v_add_i32_e64 v16, s[6:7], 32, v23
+; GISEL-NEXT: v_add_i32_e64 v17, s[6:7], 32, v27
+; GISEL-NEXT: v_add_i32_e64 v18, s[6:7], 32, v29
+; GISEL-NEXT: v_add_i32_e64 v19, s[6:7], 32, v31
+; GISEL-NEXT: v_min_u32_e32 v16, v22, v16
+; GISEL-NEXT: v_min_u32_e32 v17, v26, v17
+; GISEL-NEXT: v_min_u32_e32 v18, v28, v18
+; GISEL-NEXT: v_min_u32_e32 v19, v30, v19
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v26, 0, 1, s[4:5]
+; GISEL-NEXT: v_add_i32_e32 v16, vcc, 64, v16
+; GISEL-NEXT: v_add_i32_e32 v18, vcc, 64, v18
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[14:15]
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v17, v16, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[6:7]
+; GISEL-NEXT: v_cndmask_b32_e32 v17, v19, v18, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v16, v17
+; GISEL-NEXT: v_subb_u32_e64 v17, s[4:5], 0, 0, vcc
+; GISEL-NEXT: v_subb_u32_e64 v22, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_subb_u32_e64 v23, s[4:5], 0, 0, s[4:5]
+; GISEL-NEXT: v_cmp_gt_u64_e32 vcc, v[16:17], v[24:25]
+; GISEL-NEXT: v_cndmask_b32_e64 v24, 0, 1, vcc
+; GISEL-NEXT: v_xor_b32_e32 v18, 0x7f, v16
+; GISEL-NEXT: v_cmp_lt_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT: v_cndmask_b32_e64 v25, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v22
+; GISEL-NEXT: v_or_b32_e32 v19, v17, v23
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[22:23]
+; GISEL-NEXT: v_cndmask_b32_e32 v24, v25, v24, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[18:19]
+; GISEL-NEXT: v_cndmask_b32_e64 v18, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v19, v26, v24
+; GISEL-NEXT: v_and_b32_e32 v24, 1, v19
+; GISEL-NEXT: v_or_b32_e32 v18, v19, v18
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v24
+; GISEL-NEXT: v_cndmask_b32_e64 v24, v4, 0, vcc
+; GISEL-NEXT: v_and_b32_e32 v26, 1, v18
+; GISEL-NEXT: v_cndmask_b32_e64 v25, v5, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v18, v6, 0, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v19, v7, 0, vcc
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v26
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB3_12
+; GISEL-NEXT: ; %bb.7: ; %udiv-bb1
+; GISEL-NEXT: v_add_i32_e32 v34, vcc, 1, v16
+; GISEL-NEXT: v_addc_u32_e64 v35, s[4:5], 0, v17, vcc
+; GISEL-NEXT: v_sub_i32_e32 v28, vcc, 0x7f, v16
+; GISEL-NEXT: v_addc_u32_e64 v36, vcc, 0, v22, s[4:5]
+; GISEL-NEXT: v_addc_u32_e32 v37, vcc, 0, v23, vcc
+; GISEL-NEXT: v_subrev_i32_e64 v24, s[4:5], 64, v28
+; GISEL-NEXT: v_sub_i32_e64 v22, s[4:5], 64, v28
+; GISEL-NEXT: v_lshl_b64 v[16:17], v[4:5], v28
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[6:7], v28
+; GISEL-NEXT: s_xor_b64 s[4:5], vcc, -1
+; GISEL-NEXT: v_lshr_b64 v[22:23], v[4:5], v22
+; GISEL-NEXT: v_lshl_b64 v[26:27], v[4:5], v24
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v28
+; GISEL-NEXT: v_cndmask_b32_e32 v24, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v25, 0, v17, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v22, v18
+; GISEL-NEXT: v_or_b32_e32 v17, v23, v19
+; GISEL-NEXT: v_cndmask_b32_e32 v16, v26, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v17, v27, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v28
+; GISEL-NEXT: v_cndmask_b32_e32 v22, v16, v6, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v23, v17, v7, vcc
+; GISEL-NEXT: s_mov_b64 s[10:11], s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v19, s11
+; GISEL-NEXT: v_mov_b32_e32 v18, s10
+; GISEL-NEXT: v_mov_b32_e32 v17, s9
+; GISEL-NEXT: v_mov_b32_e32 v16, s8
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; GISEL-NEXT: s_xor_b64 s[8:9], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB3_11
+; GISEL-NEXT: ; %bb.8: ; %udiv-preheader
+; GISEL-NEXT: v_subrev_i32_e32 v28, vcc, 64, v34
+; GISEL-NEXT: v_sub_i32_e32 v26, vcc, 64, v34
+; GISEL-NEXT: v_lshr_b64 v[16:17], v[6:7], v34
+; GISEL-NEXT: v_lshr_b64 v[18:19], v[4:5], v34
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_add_i32_e32 v38, vcc, -1, v12
+; GISEL-NEXT: v_addc_u32_e32 v39, vcc, -1, v13, vcc
+; GISEL-NEXT: v_lshl_b64 v[26:27], v[6:7], v26
+; GISEL-NEXT: v_lshr_b64 v[28:29], v[6:7], v28
+; GISEL-NEXT: v_addc_u32_e32 v48, vcc, -1, v14, vcc
+; GISEL-NEXT: v_addc_u32_e32 v49, vcc, -1, v15, vcc
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v26
+; GISEL-NEXT: v_or_b32_e32 v19, v19, v27
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v34
+; GISEL-NEXT: v_cndmask_b32_e32 v18, v28, v18, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v19, v29, v19, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v30, 0, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v31, 0, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v34
+; GISEL-NEXT: v_cndmask_b32_e32 v28, v18, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v29, v19, v5, vcc
+; GISEL-NEXT: v_mov_b32_e32 v27, 0
+; GISEL-NEXT: v_mov_b32_e32 v19, s7
+; GISEL-NEXT: v_mov_b32_e32 v18, s6
+; GISEL-NEXT: v_mov_b32_e32 v17, s5
+; GISEL-NEXT: v_mov_b32_e32 v16, s4
+; GISEL-NEXT: .LBB3_9: ; %udiv-do-while
+; GISEL-NEXT: ; =>This Inner Loop Header: Depth=1
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[24:25], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v26, 31, v25
+; GISEL-NEXT: v_lshl_b64 v[50:51], v[28:29], 1
+; GISEL-NEXT: v_lshl_b64 v[30:31], v[30:31], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v28, 31, v29
+; GISEL-NEXT: v_lshrrev_b32_e32 v29, 31, v23
+; GISEL-NEXT: v_lshl_b64 v[22:23], v[22:23], 1
+; GISEL-NEXT: v_add_i32_e32 v34, vcc, -1, v34
+; GISEL-NEXT: v_addc_u32_e32 v35, vcc, -1, v35, vcc
+; GISEL-NEXT: v_or_b32_e32 v24, v16, v18
+; GISEL-NEXT: v_or_b32_e32 v25, v17, v19
+; GISEL-NEXT: v_or_b32_e32 v18, v30, v28
+; GISEL-NEXT: v_or_b32_e32 v19, v50, v29
+; GISEL-NEXT: v_or_b32_e32 v22, v22, v26
+; GISEL-NEXT: v_addc_u32_e32 v36, vcc, -1, v36, vcc
+; GISEL-NEXT: v_addc_u32_e32 v37, vcc, -1, v37, vcc
+; GISEL-NEXT: v_sub_i32_e32 v16, vcc, v38, v19
+; GISEL-NEXT: v_subb_u32_e32 v16, vcc, v39, v51, vcc
+; GISEL-NEXT: v_or_b32_e32 v16, v34, v36
+; GISEL-NEXT: v_or_b32_e32 v17, v35, v37
+; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v48, v18, vcc
+; GISEL-NEXT: v_subb_u32_e32 v26, vcc, v49, v31, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[16:17]
+; GISEL-NEXT: v_ashrrev_i32_e32 v16, 31, v26
+; GISEL-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v26, 1, v16
+; GISEL-NEXT: v_and_b32_e32 v17, v16, v12
+; GISEL-NEXT: v_and_b32_e32 v29, v16, v13
+; GISEL-NEXT: v_and_b32_e32 v30, v16, v14
+; GISEL-NEXT: v_and_b32_e32 v50, v16, v15
+; GISEL-NEXT: v_sub_i32_e32 v28, vcc, v19, v17
+; GISEL-NEXT: v_subb_u32_e32 v29, vcc, v51, v29, vcc
+; GISEL-NEXT: v_mov_b32_e32 v16, v26
+; GISEL-NEXT: v_mov_b32_e32 v17, v27
+; GISEL-NEXT: v_subb_u32_e32 v30, vcc, v18, v30, vcc
+; GISEL-NEXT: v_subb_u32_e32 v31, vcc, v31, v50, vcc
+; GISEL-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execnz .LBB3_9
+; GISEL-NEXT: ; %bb.10: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB3_11: ; %Flow11
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_b64 v[26:27], v[24:25], 1
+; GISEL-NEXT: v_lshl_b64 v[18:19], v[22:23], 1
+; GISEL-NEXT: v_lshrrev_b32_e32 v22, 31, v25
+; GISEL-NEXT: v_or_b32_e32 v18, v18, v22
+; GISEL-NEXT: v_or_b32_e32 v24, v16, v26
+; GISEL-NEXT: v_or_b32_e32 v25, v17, v27
+; GISEL-NEXT: .LBB3_12: ; %Flow12
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[4:5], v8, v32, 0
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v8, v20, 0
+; GISEL-NEXT: v_mul_lo_u32 v28, v8, v21
+; GISEL-NEXT: v_mul_lo_u32 v29, v9, v20
+; GISEL-NEXT: v_mad_u64_u32 v[20:21], s[4:5], v12, v24, 0
+; GISEL-NEXT: v_mad_u64_u32 v[26:27], s[4:5], v12, v18, 0
+; GISEL-NEXT: v_mul_lo_u32 v30, v12, v19
+; GISEL-NEXT: v_mul_lo_u32 v31, v13, v18
+; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v9, v33, v[22:23]
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v13, v25, v[26:27]
+; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[4:5], v10, v32, v[18:19]
+; GISEL-NEXT: v_mad_u64_u32 v[22:23], s[4:5], v14, v24, v[22:23]
+; GISEL-NEXT: v_mad_u64_u32 v[17:18], vcc, v8, v33, v[17:18]
+; GISEL-NEXT: v_mad_u64_u32 v[21:22], s[4:5], v12, v25, v[21:22]
+; GISEL-NEXT: v_mad_u64_u32 v[8:9], s[6:7], v9, v32, v[17:18]
+; GISEL-NEXT: v_addc_u32_e64 v17, s[6:7], v19, v28, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[12:13], s[6:7], v13, v24, v[21:22]
+; GISEL-NEXT: v_addc_u32_e64 v18, s[6:7], v23, v30, s[6:7]
+; GISEL-NEXT: v_addc_u32_e32 v17, vcc, v17, v29, vcc
+; GISEL-NEXT: v_sub_i32_e32 v0, vcc, v0, v16
+; GISEL-NEXT: v_subb_u32_e32 v1, vcc, v1, v8, vcc
+; GISEL-NEXT: v_addc_u32_e64 v8, s[4:5], v18, v31, s[4:5]
+; GISEL-NEXT: v_sub_i32_e64 v4, s[4:5], v4, v20
+; GISEL-NEXT: v_subb_u32_e64 v5, s[4:5], v5, v12, s[4:5]
+; GISEL-NEXT: v_mad_u64_u32 v[16:17], s[6:7], v10, v33, v[17:18]
+; GISEL-NEXT: v_mad_u64_u32 v[18:19], s[6:7], v14, v25, v[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[10:11], s[6:7], v11, v32, v[16:17]
+; GISEL-NEXT: v_mad_u64_u32 v[11:12], s[6:7], v15, v24, v[18:19]
+; GISEL-NEXT: v_subb_u32_e32 v2, vcc, v2, v9, vcc
+; GISEL-NEXT: v_subb_u32_e32 v3, vcc, v3, v10, vcc
+; GISEL-NEXT: v_subb_u32_e64 v6, vcc, v6, v13, s[4:5]
+; GISEL-NEXT: v_subb_u32_e32 v7, vcc, v7, v11, vcc
+; GISEL-NEXT: s_setpc_b64 s[30:31]
%shl = urem <2 x i128> %lhs, %rhs
ret <2 x i128> %shl
}
diff --git a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
index 0069370..05558c5 100644
--- a/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
+++ b/llvm/test/CodeGen/AMDGPU/duplicate-attribute-indirect.ll
@@ -42,6 +42,6 @@ attributes #0 = { "amdgpu-no-dispatch-id" }
;.
; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-no-dispatch-id" "amdgpu-stack-objects" }
;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "amdgpu-no-dispatch-id" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
index 4ed1b8a..e198197 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize-elimination.ll
@@ -471,25 +471,15 @@ define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_iee
ret void
}
-; GCN-LABEL: test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode:
-; VI-FLUSH: v_mul_f32_e32 v{{[0-9]+}}, 1.0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GCN: v_min_f32_e32 v{{[0-9]+}}, 0, v{{[0-9]+}}
-; GCN-DENORM-NOT: v_max
-; GCN-DENORM-NOT: v_mul
-
-; GFX9: {{flat|global}}_store_dword
-define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
- %id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
- %load = load float, ptr addrspace(1) %gep, align 4
- %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
- %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
- store float %canonicalized, ptr addrspace(1) %gep, align 4
- ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_minnum_value_from_load_f32_nnan_ieee_mode(ptr addrspace(1) %arg) #1 {
+; %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+; %load = load float, ptr addrspace(1) %gep, align 4
+; %v = tail call float @llvm.minnum.f32(float %load, float 0.0)
+; %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+; store float %canonicalized, ptr addrspace(1) %gep, align 4
+; ret void
+; }
; GCN-LABEL: test_fold_canonicalize_minnum_value_f32:
; GCN: v_min_f32_e32 [[V:v[0-9]+]], 0, v{{[0-9]+}}
@@ -523,32 +513,15 @@ define amdgpu_kernel void @test_fold_canonicalize_sNaN_value_f32(ptr addrspace(1
ret void
}
-; GCN-LABEL: test_fold_canonicalize_denorm_value_f32:
-; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
-
-; GFX9-DENORM: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[QUIET]]
-
-; GFX9-FLUSH: v_max_f32_e32 [[QUIET:v[0-9]+]], [[VAL]], [[VAL]]
-; GFX9-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET]]
-
-; VI-FLUSH: v_mul_f32_e32 [[QUIET_V0:v[0-9]+]], 1.0, [[VAL]]
-; VI-FLUSH: v_min_f32_e32 [[RESULT:v[0-9]+]], 0, [[QUIET_V0]]
-
-; VI-DENORM: v_min_f32_e32 [[RESULT:v[0-9]+]], 0x7fffff, [[VAL]]
-
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_dword v{{.+}}, [[RESULT]]
-define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
- %id = tail call i32 @llvm.amdgcn.workitem.id.x()
- %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
- %load = load float, ptr addrspace(1) %gep, align 4
- %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
- %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
- store float %canonicalized, ptr addrspace(1) %gep, align 4
- ret void
-}
+; define amdgpu_kernel void @test_fold_canonicalize_denorm_value_f32(ptr addrspace(1) %arg) {
+; %id = tail call i32 @llvm.amdgcn.workitem.id.x()
+; %gep = getelementptr inbounds float, ptr addrspace(1) %arg, i32 %id
+; %load = load float, ptr addrspace(1) %gep, align 4
+; %v = tail call float @llvm.minnum.f32(float %load, float bitcast (i32 8388607 to float))
+; %canonicalized = tail call float @llvm.canonicalize.f32(float %v)
+; store float %canonicalized, ptr addrspace(1) %gep, align 4
+; ret void
+; }
; GCN-LABEL: test_fold_canonicalize_maxnum_value_from_load_f32_ieee_mode:
; GCN: {{flat|global}}_load_dword [[VAL:v[0-9]+]]
@@ -674,10 +647,9 @@ define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f64(ptr addrsp
}
; GCN-LABEL: {{^}}test_fold_canonicalize_load_nnan_value_f16
-; GCN: {{flat|global}}_load_ushort [[V:v[0-9]+]],
-; GCN-NOT: v_mul
-; GCN-NOT: v_max
-; GCN: {{flat|global}}_store_short v{{.+}}, [[V]]
+; GCN: {{flat|global}}_load_ushort [[V1:v[0-9]+]],
+; GCN: v_max_f16_e32 [[V2:v[0-9]+]], [[V1]], [[V1]]
+; GCN: {{flat|global}}_store_short v{{.+}}, [[V2]]
define amdgpu_kernel void @test_fold_canonicalize_load_nnan_value_f16(ptr addrspace(1) %arg, ptr addrspace(1) %out) #1 {
%id = tail call i32 @llvm.amdgcn.workitem.id.x()
%gep = getelementptr inbounds half, ptr addrspace(1) %arg, i32 %id
@@ -807,18 +779,13 @@ define half @v_test_canonicalize_extract_element_v2f16(<2 x half> %vec) {
ret half %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_v2f16:
-; GFX9: v_mul_f16_e32
-; GFX9: v_pk_mul_f16
-; GFX9-NOT: v_max
-; GFX9-NOT: v_pk_max
-define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
- %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
- %ins.op = fmul half %val, 8.0
- %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
- %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
- ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_insertelement_v2f16(<2 x half> %vec, half %val, i32 %idx) {
+; %vec.op = fmul <2 x half> %vec, <half 4.0, half 4.0>
+; %ins.op = fmul half %val, 8.0
+; %ins = insertelement <2 x half> %vec.op, half %ins.op, i32 %idx
+; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %ins)
+; ret <2 x half> %canonicalized
+; }
; GCN-LABEL: {{^}}v_test_canonicalize_insertelement_noncanon_vec_v2f16:
; GFX9: v_mul_f16
@@ -842,15 +809,11 @@ define <2 x half> @v_test_canonicalize_insertelement_noncanon_insval_v2f16(<2 x
ret <2 x half> %canonicalized
}
-; GCN-LABEL: {{^}}v_test_canonicalize_cvt_pkrtz:
-; GCN: s_waitcnt
-; GCN-NEXT: v_cvt_pkrtz_f16_f32 v0, v0, v1
-; GCN-NEXT: s_setpc_b64
-define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
- %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
- %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
- ret <2 x half> %canonicalized
-}
+; define <2 x half> @v_test_canonicalize_cvt_pkrtz(float %a, float %b) {
+; %cvt = call <2 x half> @llvm.amdgcn.cvt.pkrtz(float %a, float %b)
+; %canonicalized = call <2 x half> @llvm.canonicalize.v2f16(<2 x half> %cvt)
+; ret <2 x half> %canonicalized
+; }
; GCN-LABEL: {{^}}v_test_canonicalize_cubeid:
; GCN: s_waitcnt
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
index 27462130..581b7b4 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.f16.ll
@@ -94,7 +94,6 @@ define amdgpu_kernel void @v_test_canonicalize_var_f16(ptr addrspace(1) %out) #1
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -147,7 +146,6 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
; CI-NEXT: s_waitcnt lgkmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e32 v0, s2
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -170,6 +168,35 @@ define amdgpu_kernel void @s_test_canonicalize_var_f16(ptr addrspace(1) %out, i1
ret void
}
+define half @s_test_canonicalize_arg(half %x) #1 {
+; VI-LABEL: s_test_canonicalize_arg:
+; VI: ; %bb.0:
+; VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; VI-NEXT: v_max_f16_e32 v0, v0, v0
+; VI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX9-LABEL: s_test_canonicalize_arg:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX9-NEXT: s_setpc_b64 s[30:31]
+;
+; CI-LABEL: s_test_canonicalize_arg:
+; CI: ; %bb.0:
+; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; CI-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX11-LABEL: s_test_canonicalize_arg:
+; GFX11: ; %bb.0:
+; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX11-NEXT: v_max_f16_e32 v0, v0, v0
+; GFX11-NEXT: s_setpc_b64 s[30:31]
+ %canonicalized = call half @llvm.canonicalize.f16(half %x)
+ ret half %canonicalized
+}
+
define <2 x half> @v_test_canonicalize_build_vector_v2f16(half %lo, half %hi) #1 {
; VI-LABEL: v_test_canonicalize_build_vector_v2f16:
; VI: ; %bb.0:
@@ -242,7 +269,6 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_f16(ptr addrspace(1) %ou
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -299,7 +325,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_f16(ptr addrspace(1
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -357,7 +382,6 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_f16(ptr addrspace(1) %ou
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -414,7 +438,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_var_f16(ptr add
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -471,7 +494,6 @@ define amdgpu_kernel void @v_test_no_denormals_canonicalize_fneg_fabs_var_f16(pt
; CI-NEXT: buffer_load_ushort v0, off, s[0:3], 0
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f32_f16_e64 v0, -|v0|
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: buffer_store_short v0, off, s[0:3], 0
; CI-NEXT: s_endpgm
@@ -1246,9 +1268,7 @@ define amdgpu_kernel void @v_test_canonicalize_var_v2f16(ptr addrspace(1) %out)
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1323,9 +1343,7 @@ define amdgpu_kernel void @v_test_canonicalize_fabs_var_v2f16(ptr addrspace(1) %
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e64 v1, |v1|
; CI-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1404,9 +1422,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_fabs_var_v2f16(ptr addrspace
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1485,9 +1501,7 @@ define amdgpu_kernel void @v_test_canonicalize_fneg_var_v2f16(ptr addrspace(1) %
; CI-NEXT: v_lshrrev_b32_e32 v1, 16, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
; CI-NEXT: v_or_b32_e32 v0, v0, v1
@@ -1551,9 +1565,7 @@ define amdgpu_kernel void @s_test_canonicalize_var_v2f16(ptr addrspace(1) %out,
; CI-NEXT: v_cvt_f32_f16_e32 v1, s2
; CI-NEXT: s_mov_b32 s3, 0xf000
; CI-NEXT: s_mov_b32 s2, -1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
; CI-NEXT: v_or_b32_e32 v0, v1, v0
@@ -2424,7 +2436,6 @@ define <2 x half> @v_test_canonicalize_reg_undef_v2f16(half %val) #1 {
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_undef_v2f16:
@@ -2456,8 +2467,7 @@ define <2 x half> @v_test_canonicalize_undef_reg_v2f16(half %val) #1 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v0
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v0
; CI-NEXT: v_mov_b32_e32 v0, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2738,7 +2748,6 @@ define <4 x half> @v_test_canonicalize_reg_undef_undef_undef_v4f16(half %val) #1
; CI-NEXT: v_mov_b32_e32 v2, 0x7fc00000
; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_undef_undef_undef_v4f16:
@@ -2782,8 +2791,6 @@ define <4 x half> @v_test_canonicalize_reg_reg_undef_undef_v4f16(half %val0, hal
; CI-NEXT: v_mov_b32_e32 v3, 0x7fc00000
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_reg_reg_undef_undef_v4f16:
@@ -2826,13 +2833,10 @@ define <4 x half> @v_test_canonicalize_reg_undef_reg_reg_v4f16(half %val0, half
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
-; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v2
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; CI-NEXT: v_cvt_f32_f16_e32 v3, v2
-; CI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; CI-NEXT: v_mul_f32_e32 v2, 1.0, v1
-; CI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
; CI-NEXT: v_mov_b32_e32 v1, 0x7fc00000
; CI-NEXT: s_setpc_b64 s[30:31]
;
@@ -2878,18 +2882,18 @@ define <6 x half> @v_test_canonicalize_var_v6f16(<6 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v6f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v6f16:
@@ -2933,22 +2937,22 @@ define <8 x half> @v_test_canonicalize_var_v8f16(<8 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v8f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v8f16:
@@ -3001,30 +3005,30 @@ define <12 x half> @v_test_canonicalize_var_v12f16(<12 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v12f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v12f16:
@@ -3087,38 +3091,38 @@ define <16 x half> @v_test_canonicalize_var_v16f16(<16 x half> %val) #1 {
; CI-LABEL: v_test_canonicalize_var_v16f16:
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
; CI-NEXT: s_setpc_b64 s[30:31]
;
; GFX11-LABEL: v_test_canonicalize_var_v16f16:
@@ -3216,68 +3220,68 @@ define <32 x half> @v_test_canonicalize_var_v32f16(<32 x half> %val) #1 {
; CI: ; %bb.0:
; CI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32
+; CI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
; CI-NEXT: v_cvt_f16_f32_e32 v0, v0
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
-; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
-; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
-; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
-; CI-NEXT: v_cvt_f16_f32_e32 v30, v30
+; CI-NEXT: v_cvt_f32_f16_e32 v30, v30
+; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
; CI-NEXT: v_cvt_f32_f16_e32 v0, v0
; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
-; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
-; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
-; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
-; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
-; CI-NEXT: v_cvt_f32_f16_e32 v10, v10
-; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
-; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
-; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
-; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
-; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
-; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
-; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
-; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
-; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
-; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
-; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
-; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
-; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
-; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
-; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
-; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
-; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
-; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
-; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
-; CI-NEXT: v_cvt_f32_f16_e32 v30, v30
; CI-NEXT: s_waitcnt vmcnt(0)
; CI-NEXT: v_cvt_f16_f32_e32 v31, v31
; CI-NEXT: v_cvt_f32_f16_e32 v31, v31
@@ -3456,228 +3460,354 @@ define <64 x half> @v_test_canonicalize_var_v64f16(<64 x half> %val) #1 {
; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:104
+; CI-NEXT: buffer_load_dword v32, off, s[0:3], s32 offset:108
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v1, v1
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v1, v1, v2
; CI-NEXT: v_cvt_f16_f32_e32 v2, v4
; CI-NEXT: v_cvt_f16_f32_e32 v4, v5
; CI-NEXT: v_cvt_f16_f32_e32 v5, v7
; CI-NEXT: v_cvt_f16_f32_e32 v7, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v5, v5
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; CI-NEXT: v_or_b32_e32 v2, v3, v2
; CI-NEXT: v_cvt_f16_f32_e32 v3, v6
; CI-NEXT: v_cvt_f16_f32_e32 v6, v10
; CI-NEXT: v_cvt_f16_f32_e32 v9, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v16
+; CI-NEXT: v_cvt_f32_f16_e32 v3, v3
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v9
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v13
; CI-NEXT: v_lshlrev_b32_e32 v3, 16, v3
; CI-NEXT: v_or_b32_e32 v3, v4, v3
; CI-NEXT: v_cvt_f16_f32_e32 v4, v8
; CI-NEXT: v_cvt_f16_f32_e32 v8, v14
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v21
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v26
+; CI-NEXT: buffer_load_dword v14, off, s[0:3], s32
+; CI-NEXT: buffer_load_dword v33, off, s[0:3], s32 offset:4
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v4, v4
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
; CI-NEXT: v_lshlrev_b32_e32 v4, 16, v4
; CI-NEXT: v_or_b32_e32 v4, v5, v4
; CI-NEXT: v_lshlrev_b32_e32 v5, 16, v6
; CI-NEXT: v_cvt_f16_f32_e32 v6, v12
; CI-NEXT: v_or_b32_e32 v5, v7, v5
; CI-NEXT: v_cvt_f16_f32_e32 v7, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v7, v7
+; CI-NEXT: v_cvt_f16_f32_e32 v6, v6
+; CI-NEXT: v_cvt_f16_f32_e32 v7, v7
; CI-NEXT: v_lshlrev_b32_e32 v6, 16, v6
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v22
; CI-NEXT: v_or_b32_e32 v6, v7, v6
; CI-NEXT: v_lshlrev_b32_e32 v7, 16, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v8, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v19
; CI-NEXT: v_or_b32_e32 v7, v9, v7
-; CI-NEXT: v_cvt_f16_f32_e32 v9, v15
-; CI-NEXT: v_cvt_f16_f32_e32 v15, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v9, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v8
+; CI-NEXT: v_cvt_f32_f16_e32 v8, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v10, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v18
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:124
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:112
+; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:116
+; CI-NEXT: v_cvt_f16_f32_e32 v8, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v10
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v9, v9
; CI-NEXT: v_lshlrev_b32_e32 v8, 16, v8
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
-; CI-NEXT: v_or_b32_e32 v8, v9, v8
+; CI-NEXT: v_or_b32_e32 v8, v10, v8
+; CI-NEXT: v_cvt_f16_f32_e32 v10, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v9
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v12
; CI-NEXT: v_lshlrev_b32_e32 v9, 16, v10
-; CI-NEXT: v_cvt_f16_f32_e32 v10, v20
; CI-NEXT: v_or_b32_e32 v9, v11, v9
-; CI-NEXT: v_cvt_f16_f32_e32 v11, v19
-; CI-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:4
-; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32
-; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:12
-; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:8
-; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v10
-; CI-NEXT: v_or_b32_e32 v10, v11, v10
-; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v12, v24
+; CI-NEXT: v_lshlrev_b32_e32 v10, 16, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v22
+; CI-NEXT: v_or_b32_e32 v10, v12, v10
+; CI-NEXT: v_cvt_f16_f32_e32 v12, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v11
+; CI-NEXT: v_cvt_f32_f16_e32 v11, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v13, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v12, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v11, v11
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v13
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v30
+; CI-NEXT: v_lshlrev_b32_e32 v11, 16, v11
; CI-NEXT: v_or_b32_e32 v11, v13, v11
-; CI-NEXT: v_cvt_f16_f32_e32 v13, v23
-; CI-NEXT: buffer_load_dword v20, off, s[0:3], s32 offset:20
-; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:16
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:28
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:24
-; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v12
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v30
-; CI-NEXT: v_or_b32_e32 v12, v13, v12
-; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v14
-; CI-NEXT: v_or_b32_e32 v13, v15, v13
-; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v13, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v12
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v29
+; CI-NEXT: v_lshlrev_b32_e32 v12, 16, v13
+; CI-NEXT: v_or_b32_e32 v12, v15, v12
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v31
+; CI-NEXT: v_lshlrev_b32_e32 v13, 16, v17
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:128
+; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
+; CI-NEXT: buffer_load_dword v34, off, s[0:3], s32 offset:120
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v15
; CI-NEXT: v_cvt_f16_f32_e32 v15, v27
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:36
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:32
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:44
-; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:40
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v33
+; CI-NEXT: v_cvt_f32_f16_e32 v15, v15
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v15, v15
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_cvt_f32_f16_e32 v14, v14
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v14, v14
+; CI-NEXT: v_or_b32_e32 v13, v16, v13
+; CI-NEXT: v_cvt_f16_f32_e32 v16, v32
+; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:12
; CI-NEXT: v_lshlrev_b32_e32 v14, 16, v14
; CI-NEXT: v_or_b32_e32 v14, v15, v14
-; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v24
+; CI-NEXT: v_lshlrev_b32_e32 v15, 16, v22
; CI-NEXT: v_or_b32_e32 v15, v25, v15
-; CI-NEXT: s_waitcnt vmcnt(11)
-; CI-NEXT: v_cvt_f16_f32_e32 v16, v16
-; CI-NEXT: s_waitcnt vmcnt(10)
-; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v21
+; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:96
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:100
+; CI-NEXT: v_cvt_f32_f16_e32 v16, v16
+; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:64
+; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v16
+; CI-NEXT: v_or_b32_e32 v16, v24, v25
+; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v27
+; CI-NEXT: v_or_b32_e32 v25, v28, v24
; CI-NEXT: s_waitcnt vmcnt(9)
; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
; CI-NEXT: s_waitcnt vmcnt(8)
; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
-; CI-NEXT: v_lshlrev_b32_e32 v16, 16, v16
-; CI-NEXT: v_or_b32_e32 v16, v17, v16
-; CI-NEXT: v_lshlrev_b32_e32 v17, 16, v18
-; CI-NEXT: v_or_b32_e32 v17, v19, v17
; CI-NEXT: s_waitcnt vmcnt(7)
-; CI-NEXT: v_cvt_f16_f32_e32 v18, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
+; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
+; CI-NEXT: v_or_b32_e32 v20, v19, v20
+; CI-NEXT: buffer_load_dword v19, off, s[0:3], s32 offset:20
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:8
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
; CI-NEXT: s_waitcnt vmcnt(6)
-; CI-NEXT: v_cvt_f16_f32_e32 v19, v21
-; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v34
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
+; CI-NEXT: v_or_b32_e32 v17, v17, v26
+; CI-NEXT: v_add_i32_e32 v26, vcc, 0x7c, v0
+; CI-NEXT: v_or_b32_e32 v18, v27, v18
+; CI-NEXT: buffer_store_dword v17, v26, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x78, v0
+; CI-NEXT: buffer_store_dword v18, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x74, v0
+; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x70, v0
+; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: s_waitcnt vmcnt(7)
; CI-NEXT: v_cvt_f16_f32_e32 v20, v22
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v23
-; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
-; CI-NEXT: v_or_b32_e32 v18, v19, v18
-; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v20
-; CI-NEXT: v_or_b32_e32 v19, v21, v19
-; CI-NEXT: s_waitcnt vmcnt(3)
-; CI-NEXT: v_cvt_f16_f32_e32 v20, v26
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v21, v27
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:88
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:92
+; CI-NEXT: buffer_load_dword v17, off, s[0:3], s32 offset:80
+; CI-NEXT: buffer_load_dword v18, off, s[0:3], s32 offset:84
+; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:72
+; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:76
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v20, v20
+; CI-NEXT: s_waitcnt vmcnt(12)
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v20, v20
+; CI-NEXT: v_cvt_f32_f16_e32 v29, v29
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v20
; CI-NEXT: v_or_b32_e32 v20, v21, v20
-; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v26
-; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:52
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:48
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:60
-; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:56
-; CI-NEXT: v_or_b32_e32 v21, v27, v21
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:132
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:128
-; CI-NEXT: s_waitcnt vmcnt(5)
-; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
-; CI-NEXT: s_waitcnt vmcnt(4)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_add_i32_e32 v21, vcc, 0x6c, v0
+; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
+; CI-NEXT: v_lshlrev_b32_e32 v20, 16, v22
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:24
+; CI-NEXT: v_cvt_f16_f32_e32 v29, v29
+; CI-NEXT: s_waitcnt vmcnt(13)
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
+; CI-NEXT: s_waitcnt vmcnt(12)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v24
+; CI-NEXT: buffer_load_dword v21, off, s[0:3], s32 offset:28
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:16
+; CI-NEXT: v_cvt_f32_f16_e32 v19, v19
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v19, v19
; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
-; CI-NEXT: s_waitcnt vmcnt(1)
+; CI-NEXT: v_lshlrev_b32_e32 v19, 16, v19
+; CI-NEXT: v_or_b32_e32 v20, v23, v20
+; CI-NEXT: s_waitcnt vmcnt(9)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: s_waitcnt vmcnt(8)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v28
+; CI-NEXT: s_waitcnt vmcnt(7)
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: s_waitcnt vmcnt(6)
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: s_waitcnt vmcnt(4)
; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v24, 16, v24
-; CI-NEXT: v_or_b32_e32 v24, v25, v24
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x7c, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:124
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:120
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v18, v18
+; CI-NEXT: v_cvt_f32_f16_e32 v26, v26
+; CI-NEXT: v_cvt_f32_f16_e32 v17, v17
; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT: v_or_b32_e32 v22, v22, v23
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:88
-; CI-NEXT: s_waitcnt vmcnt(2)
+; CI-NEXT: v_or_b32_e32 v23, v27, v23
+; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
+; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
+; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:32
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:36
+; CI-NEXT: v_cvt_f32_f16_e32 v25, v25
+; CI-NEXT: v_cvt_f16_f32_e32 v18, v18
; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v17, v17
+; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_lshlrev_b32_e32 v18, 16, v18
; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x78, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:116
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:112
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_or_b32_e32 v17, v17, v18
+; CI-NEXT: v_add_i32_e32 v18, vcc, 0x64, v0
+; CI-NEXT: v_or_b32_e32 v25, v25, v26
+; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x60, v0
+; CI-NEXT: buffer_store_dword v25, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x5c, v0
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v21, v21
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v21, v21
+; CI-NEXT: v_or_b32_e32 v19, v24, v19
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:44
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_lshlrev_b32_e32 v21, 16, v21
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_or_b32_e32 v21, v22, v21
+; CI-NEXT: buffer_load_dword v22, off, s[0:3], s32 offset:40
+; CI-NEXT: s_waitcnt vmcnt(5)
+; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: s_waitcnt vmcnt(4)
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v23, v23
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
; CI-NEXT: s_waitcnt vmcnt(0)
+; CI-NEXT: v_cvt_f16_f32_e32 v22, v22
+; CI-NEXT: v_cvt_f32_f16_e32 v22, v22
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v22
+; CI-NEXT: v_or_b32_e32 v22, v23, v27
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:52
+; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v24
+; CI-NEXT: v_or_b32_e32 v23, v28, v23
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:56
+; CI-NEXT: buffer_load_dword v24, off, s[0:3], s32 offset:48
+; CI-NEXT: s_waitcnt vmcnt(2)
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x74, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:108
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104
; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v26
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:92
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x70, v0
-; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:100
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96
-; CI-NEXT: s_waitcnt vmcnt(3)
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v24, v24
; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v24, v24
; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
-; CI-NEXT: v_or_b32_e32 v23, v23, v27
-; CI-NEXT: s_waitcnt vmcnt(1)
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
+; CI-NEXT: v_or_b32_e32 v24, v24, v27
+; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:60
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x68, v0
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x6c, v0
-; CI-NEXT: buffer_store_dword v25, v26, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v25, off, s[0:3], s32 offset:68
-; CI-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:64
-; CI-NEXT: buffer_store_dword v23, v27, s[0:3], 0 offen
-; CI-NEXT: buffer_load_dword v23, off, s[0:3], s32 offset:76
-; CI-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:72
-; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:84
-; CI-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:80
-; CI-NEXT: s_waitcnt vmcnt(3)
-; CI-NEXT: v_cvt_f16_f32_e32 v23, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v25, v25
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v26
-; CI-NEXT: v_lshlrev_b32_e32 v23, 16, v23
-; CI-NEXT: v_lshlrev_b32_e32 v25, 16, v25
-; CI-NEXT: v_or_b32_e32 v25, v26, v25
-; CI-NEXT: s_waitcnt vmcnt(2)
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_cvt_f32_f16_e32 v27, v27
+; CI-NEXT: v_cvt_f16_f32_e32 v27, v27
+; CI-NEXT: v_lshlrev_b32_e32 v27, 16, v27
+; CI-NEXT: v_or_b32_e32 v27, v28, v27
+; CI-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:68
; CI-NEXT: s_waitcnt vmcnt(0)
-; CI-NEXT: v_cvt_f16_f32_e32 v27, v29
-; CI-NEXT: v_or_b32_e32 v23, v26, v23
-; CI-NEXT: v_cvt_f16_f32_e32 v26, v28
-; CI-NEXT: v_lshlrev_b32_e32 v26, 16, v26
-; CI-NEXT: v_or_b32_e32 v26, v27, v26
-; CI-NEXT: v_add_i32_e32 v27, vcc, 0x64, v0
-; CI-NEXT: buffer_store_dword v26, v27, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v26, vcc, 0x60, v0
-; CI-NEXT: buffer_store_dword v23, v26, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v23, vcc, 0x5c, v0
-; CI-NEXT: buffer_store_dword v25, v23, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v23, vcc, 0x58, v0
-; CI-NEXT: buffer_store_dword v22, v23, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v22, vcc, 0x54, v0
-; CI-NEXT: buffer_store_dword v24, v22, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v22, vcc, 0x50, v0
-; CI-NEXT: buffer_store_dword v21, v22, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v21, vcc, 0x4c, v0
-; CI-NEXT: buffer_store_dword v20, v21, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v20, vcc, 0x48, v0
-; CI-NEXT: buffer_store_dword v19, v20, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v19, vcc, 0x44, v0
-; CI-NEXT: buffer_store_dword v18, v19, s[0:3], 0 offen
-; CI-NEXT: v_add_i32_e32 v18, vcc, 64, v0
-; CI-NEXT: buffer_store_dword v17, v18, s[0:3], 0 offen
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_cvt_f32_f16_e32 v28, v28
+; CI-NEXT: v_cvt_f16_f32_e32 v28, v28
+; CI-NEXT: v_lshlrev_b32_e32 v28, 16, v28
+; CI-NEXT: v_or_b32_e32 v28, v29, v28
+; CI-NEXT: buffer_store_dword v28, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x58, v0
+; CI-NEXT: buffer_store_dword v27, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x54, v0
+; CI-NEXT: buffer_store_dword v24, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x50, v0
+; CI-NEXT: buffer_store_dword v23, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x4c, v0
+; CI-NEXT: buffer_store_dword v22, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x48, v0
+; CI-NEXT: buffer_store_dword v21, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 0x44, v0
+; CI-NEXT: buffer_store_dword v19, v17, s[0:3], 0 offen
+; CI-NEXT: v_add_i32_e32 v17, vcc, 64, v0
+; CI-NEXT: buffer_store_dword v20, v17, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v17, vcc, 60, v0
; CI-NEXT: buffer_store_dword v16, v17, s[0:3], 0 offen
; CI-NEXT: v_add_i32_e32 v16, vcc, 56, v0
diff --git a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
index c1093a1..d53c041 100644
--- a/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
+++ b/llvm/test/CodeGen/AMDGPU/fcanonicalize.ll
@@ -2389,7 +2389,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_flush(ptr addrspace(1) %a
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2471,15 +2470,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_flush(ptr addrspace(1)
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v4
@@ -2724,7 +2721,6 @@ define amdgpu_kernel void @test_canonicalize_value_f16_denorm(ptr addrspace(1) %
; GFX6-NEXT: v_mov_b32_e32 v1, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v0
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
@@ -2807,15 +2803,13 @@ define amdgpu_kernel void @test_canonicalize_value_v2f16_denorm(ptr addrspace(1)
; GFX6-NEXT: flat_load_dword v0, v[0:1]
; GFX6-NEXT: v_mov_b32_e32 v3, s3
; GFX6-NEXT: s_waitcnt vmcnt(0)
-; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v0
-; GFX6-NEXT: v_lshrrev_b32_e32 v0, 16, v0
+; GFX6-NEXT: v_lshrrev_b32_e32 v1, 16, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v1, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
-; GFX6-NEXT: v_mul_f32_e32 v1, 1.0, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
-; GFX6-NEXT: v_mul_f32_e32 v0, 1.0, v0
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: v_lshlrev_b32_e32 v0, 16, v0
-; GFX6-NEXT: v_or_b32_e32 v4, v1, v0
+; GFX6-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; GFX6-NEXT: v_or_b32_e32 v4, v0, v1
; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v2
; GFX6-NEXT: v_addc_u32_e32 v1, vcc, 0, v3, vcc
; GFX6-NEXT: flat_store_dword v[0:1], v4
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index 78fb89c..b32630a 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -951,8 +951,6 @@ define half @v_fneg_minnum_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1056,7 +1054,6 @@ define half @v_fneg_posk_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1110,7 +1107,6 @@ define half @v_fneg_negk_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1193,7 +1189,6 @@ define half @v_fneg_neg0_minnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1222,7 +1217,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1253,7 +1247,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1311,7 +1304,6 @@ define half @v_fneg_0_minnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1346,7 +1338,6 @@ define half @v_fneg_inv2pi_minnum_foldable_use_f16(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: v_mul_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1413,8 +1404,6 @@ define { half, half } @v_fneg_minnum_multi_use_minnum_f16_ieee(half %a, half %b)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1494,8 +1483,6 @@ define half @v_fneg_maxnum_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1599,7 +1586,6 @@ define half @v_fneg_posk_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1653,7 +1639,6 @@ define half @v_fneg_negk_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1736,7 +1721,6 @@ define half @v_fneg_neg0_maxnum_f16_ieee(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1792,7 +1776,6 @@ define half @v_fneg_0_maxnum_foldable_use_f16_ieee(half %a, half %b) #0 {
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0, v0
; SI-NEXT: v_mul_f32_e64 v0, -v0, v1
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -1859,8 +1842,6 @@ define { half, half } @v_fneg_maxnum_multi_use_maxnum_f16_ieee(half %a, half %b)
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e64 v1, -v1
; SI-NEXT: v_cvt_f32_f16_e64 v0, -v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_mul_f32_e32 v1, -4.0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
@@ -3980,7 +3961,8 @@ define half @v_fneg_canonicalize_f16(half %a) #0 {
; SI-LABEL: v_fneg_canonicalize_f16:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SI-NEXT: v_xor_b32_e32 v0, 0x80000000, v0
+; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
; VI-LABEL: v_fneg_canonicalize_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
index 17f6761..b5440b9 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.new.ll
@@ -1021,7 +1021,6 @@ define half @v_fneg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0xbe230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1043,7 +1042,6 @@ define half @v_fneg_neg_inv2pi_minnum_f16(half %a) #0 {
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SI-NEXT: v_cvt_f16_f32_e64 v0, -v0
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0x3e230000, v0
; SI-NEXT: s_setpc_b64 s[30:31]
;
diff --git a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir
index 3616d61..5ef8a94 100644
--- a/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir
+++ b/llvm/test/CodeGen/AMDGPU/fold-restore-undef-use.mir
@@ -8,6 +8,8 @@
---
name: restore_undef_copy_use
tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
machineFunctionInfo:
maxKernArgAlign: 1
isEntryFunction: true
diff --git a/llvm/test/CodeGen/AMDGPU/fp-classify.ll b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
index 6fa7df9..18d2e52 100644
--- a/llvm/test/CodeGen/AMDGPU/fp-classify.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp-classify.ll
@@ -618,16 +618,16 @@ define amdgpu_kernel void @test_not_isfinite_pattern_4_wrong_ord_test(ptr addrsp
define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isinf_pattern_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_mov_b32 s1, 0x7f800000
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e64 v0, |s0|
-; SI-NEXT: v_cmp_eq_f32_e32 vcc, s1, v0
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: s_cmpk_eq_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isinf_pattern_f16:
@@ -667,16 +667,19 @@ define amdgpu_kernel void @test_isinf_pattern_f16(ptr addrspace(1) nocapture %ou
define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isfinite_pattern_0_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_movk_i32 s1, 0x1f8
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT: s_cmpk_lg_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_0_f16:
@@ -718,16 +721,19 @@ define amdgpu_kernel void @test_isfinite_pattern_0_f16(ptr addrspace(1) nocaptur
define amdgpu_kernel void @test_isfinite_pattern_4_f16(ptr addrspace(1) nocapture %out, half %x) #0 {
; SI-LABEL: test_isfinite_pattern_4_f16:
; SI: ; %bb.0:
-; SI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9
-; SI-NEXT: s_load_dword s0, s[0:1], 0xb
-; SI-NEXT: s_mov_b32 s7, 0xf000
-; SI-NEXT: s_mov_b32 s6, -1
-; SI-NEXT: s_movk_i32 s1, 0x1f8
+; SI-NEXT: s_load_dword s4, s[0:1], 0xb
+; SI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; SI-NEXT: s_mov_b32 s3, 0xf000
+; SI-NEXT: s_mov_b32 s2, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s0
-; SI-NEXT: v_cmp_class_f32_e64 s[0:1], v0, s1
-; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[0:1]
-; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
+; SI-NEXT: s_and_b32 s4, s4, 0x7fff
+; SI-NEXT: v_cmp_o_f32_e32 vcc, v0, v0
+; SI-NEXT: s_cmpk_lt_i32 s4, 0x7c00
+; SI-NEXT: s_cselect_b64 s[4:5], -1, 0
+; SI-NEXT: s_and_b64 s[4:5], vcc, s[4:5]
+; SI-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
; VI-LABEL: test_isfinite_pattern_4_f16:
diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
index 767d347..a948fab 100644
--- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll
@@ -1181,18 +1181,28 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB42_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB42_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB42_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1200,20 +1210,30 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat(ptr addrspace(1) %pt
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB42_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB42_2
+; GFX90A-NEXT: .LBB42_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB42_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: .LBB42_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 seq_cst
@@ -1223,26 +1243,45 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB43_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: .LBB43_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB43_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: .LBB43_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1252,18 +1291,28 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace(1) %ptr) #1 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB44_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB44_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB44_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
; GFX90A-NEXT: buffer_wbl2
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_invl2
; GFX90A-NEXT: buffer_wbinvl1_vol
@@ -1271,20 +1320,30 @@ define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_system(ptr addrspace
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB44_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB44_2
+; GFX90A-NEXT: .LBB44_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_system:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB44_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc0 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] sc1
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc0 sc1
+; GFX940-NEXT: .LBB44_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") seq_cst
@@ -1294,26 +1353,45 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_flush(ptr addrspace(1) %ptr) #0 {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB45_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
+; GFX90A-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX90A-NEXT: v_mov_b32_e32 v2, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
+; GFX90A-NEXT: .LBB45_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_flush:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB45_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: .LBB45_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -1485,37 +1563,57 @@ main_body:
define amdgpu_kernel void @global_atomic_fadd_f64_noret_pat_agent_safe(ptr addrspace(1) %ptr) {
; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB52_3
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s6
; GFX90A-NEXT: s_mov_b64 s[2:3], 0
-; GFX90A-NEXT: v_mov_b32_e32 v4, 0
+; GFX90A-NEXT: v_mul_f64 v[4:5], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX90A-NEXT: v_mov_b32_e32 v6, 0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], s[4:5], s[4:5] op_sel:[0,1]
-; GFX90A-NEXT: .LBB52_1: ; %atomicrmw.start
+; GFX90A-NEXT: .LBB52_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
-; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], 4.0
-; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX90A-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX90A-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
; GFX90A-NEXT: s_waitcnt vmcnt(0)
; GFX90A-NEXT: buffer_wbinvl1_vol
; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
; GFX90A-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[0:1], v[0:1] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[2:3]
-; GFX90A-NEXT: s_cbranch_execnz .LBB52_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB52_2
+; GFX90A-NEXT: .LBB52_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: global_atomic_fadd_f64_noret_pat_agent_safe:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB52_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s2
; GFX940-NEXT: v_mov_b32_e32 v2, 0
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: buffer_wbl2 sc1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1]
; GFX940-NEXT: s_waitcnt vmcnt(0)
; GFX940-NEXT: buffer_inv sc1
+; GFX940-NEXT: .LBB52_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") seq_cst
@@ -2020,23 +2118,42 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat(ptr addrspace(3) %ptr) #1 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB70_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB70_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB70_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: .LBB70_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2046,23 +2163,42 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush(ptr addrspace(3) %ptr) #0 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX90A: ; %bb.0: ; %main_body
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB71_2
+; GFX90A-NEXT: ; %bb.1:
; GFX90A-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX90A-NEXT: v_mov_b32_e32 v0, 0
-; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000
+; GFX90A-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
; GFX90A-NEXT: v_mov_b32_e32 v2, s0
; GFX90A-NEXT: ds_add_f64 v2, v[0:1]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
+; GFX90A-NEXT: .LBB71_2:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush:
; GFX940: ; %bb.0: ; %main_body
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB71_2
+; GFX940-NEXT: ; %bb.1:
; GFX940-NEXT: s_load_dword s0, s[0:1], 0x24
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], 4.0
+; GFX940-NEXT: s_bcnt1_i32_b64 s1, s[2:3]
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s1
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
; GFX940-NEXT: v_mov_b32_e32 v2, s0
; GFX940-NEXT: ds_add_f64 v2, v[0:1]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
+; GFX940-NEXT: .LBB71_2:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
@@ -2072,46 +2208,66 @@ main_body:
define amdgpu_kernel void @local_atomic_fadd_f64_noret_pat_flush_safe(ptr addrspace(3) %ptr) #4 {
; GFX90A-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX90A: ; %bb.0: ; %main_body
-; GFX90A-NEXT: s_load_dword s2, s[0:1], 0x24
-; GFX90A-NEXT: s_mov_b64 s[0:1], 0
+; GFX90A-NEXT: s_mov_b64 s[2:3], exec
+; GFX90A-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX90A-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX90A-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX90A-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX90A-NEXT: s_cbranch_execz .LBB72_3
+; GFX90A-NEXT: ; %bb.1:
+; GFX90A-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX90A-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_mov_b32_e32 v0, s2
-; GFX90A-NEXT: ds_read_b64 v[0:1], v0
-; GFX90A-NEXT: v_mov_b32_e32 v2, s2
-; GFX90A-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX90A-NEXT: v_mov_b32_e32 v0, s4
+; GFX90A-NEXT: ds_read_b64 v[2:3], v0
+; GFX90A-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX90A-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX90A-NEXT: s_mov_b64 s[0:1], 0
+; GFX90A-NEXT: v_mov_b32_e32 v4, s4
+; GFX90A-NEXT: .LBB72_2: ; %atomicrmw.start
; GFX90A-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
-; GFX90A-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
+; GFX90A-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
+; GFX90A-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
-; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
+; GFX90A-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX90A-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX90A-NEXT: v_pk_mov_b32 v[0:1], v[4:5], v[4:5] op_sel:[0,1]
+; GFX90A-NEXT: v_pk_mov_b32 v[2:3], v[6:7], v[6:7] op_sel:[0,1]
; GFX90A-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX90A-NEXT: s_cbranch_execnz .LBB72_1
-; GFX90A-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX90A-NEXT: s_cbranch_execnz .LBB72_2
+; GFX90A-NEXT: .LBB72_3:
; GFX90A-NEXT: s_endpgm
;
; GFX940-LABEL: local_atomic_fadd_f64_noret_pat_flush_safe:
; GFX940: ; %bb.0: ; %main_body
-; GFX940-NEXT: s_load_dword s2, s[0:1], 0x24
-; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: s_mov_b64 s[2:3], exec
+; GFX940-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX940-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX940-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX940-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX940-NEXT: s_cbranch_execz .LBB72_3
+; GFX940-NEXT: ; %bb.1:
+; GFX940-NEXT: s_load_dword s4, s[0:1], 0x24
+; GFX940-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_mov_b32_e32 v0, s2
-; GFX940-NEXT: ds_read_b64 v[0:1], v0
-; GFX940-NEXT: v_mov_b32_e32 v2, s2
-; GFX940-NEXT: .LBB72_1: ; %atomicrmw.start
+; GFX940-NEXT: v_mov_b32_e32 v0, s4
+; GFX940-NEXT: ds_read_b64 v[2:3], v0
+; GFX940-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX940-NEXT: v_mul_f64 v[0:1], v[0:1], 4.0
+; GFX940-NEXT: s_mov_b64 s[0:1], 0
+; GFX940-NEXT: v_mov_b32_e32 v4, s4
+; GFX940-NEXT: .LBB72_2: ; %atomicrmw.start
; GFX940-NEXT: ; =>This Inner Loop Header: Depth=1
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_add_f64 v[4:5], v[0:1], 4.0
-; GFX940-NEXT: ds_cmpst_rtn_b64 v[4:5], v2, v[0:1], v[4:5]
+; GFX940-NEXT: v_add_f64 v[6:7], v[2:3], v[0:1]
+; GFX940-NEXT: ds_cmpst_rtn_b64 v[6:7], v4, v[2:3], v[6:7]
; GFX940-NEXT: s_waitcnt lgkmcnt(0)
-; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[0:1]
+; GFX940-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
; GFX940-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
-; GFX940-NEXT: v_mov_b64_e32 v[0:1], v[4:5]
+; GFX940-NEXT: v_mov_b64_e32 v[2:3], v[6:7]
; GFX940-NEXT: s_andn2_b64 exec, exec, s[0:1]
-; GFX940-NEXT: s_cbranch_execnz .LBB72_1
-; GFX940-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX940-NEXT: s_cbranch_execnz .LBB72_2
+; GFX940-NEXT: .LBB72_3:
; GFX940-NEXT: s_endpgm
main_body:
%ret = atomicrmw fadd ptr addrspace(3) %ptr, double 4.0 seq_cst
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
new file mode 100644
index 0000000..66bf0d5
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll
@@ -0,0 +1,1502 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define i128 @fptosi_f64_to_i128(double %x) {
+; SDAG-LABEL: fptosi_f64_to_i128:
+; SDAG: ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v5, v1
+; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe
+; SDAG-NEXT: v_mov_b32_e32 v4, v0
+; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB0_10
+; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT: s_mov_b32 s5, -1
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB0_7
+; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
+; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5]
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x432
+; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB0_4
+; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6
+; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6
+; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, v1
+; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3]
+; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT: v_mov_b32_e32 v6, v2
+; SDAG-NEXT: v_mov_b32_e32 v2, v3
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5]
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
+; SDAG-NEXT: ; implicit-def: $vgpr11
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: .LBB0_4: ; %Flow
+; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13]
+; SDAG-NEXT: s_cbranch_execz .LBB0_6
+; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2]
+; SDAG-NEXT: v_mov_b32_e32 v7, v4
+; SDAG-NEXT: v_mov_b32_e32 v4, v2
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4]
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4]
+; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3
+; SDAG-NEXT: .LBB0_6: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB0_7: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
+; SDAG-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-NEXT: ; %bb.9: ; %Flow3
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB0_10: ; %fp-to-i-cleanup
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptosi_f64_to_i128:
+; GISEL: ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v5, v1
+; GISEL-NEXT: v_mov_b32_e32 v4, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5
+; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_mov_b32_e32 v7, 0
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB0_10
+; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v3, -1
+; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB0_7
+; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v20
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x433
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB0_4
+; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr10
+; GISEL-NEXT: ; implicit-def: $vgpr9
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr6
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: .LBB0_4: ; %Flow
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17]
+; GISEL-NEXT: s_cbranch_execz .LBB0_6
+; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6
+; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: .LBB0_6: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: .LBB0_7: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT: s_cbranch_execz .LBB0_9
+; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT: v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GISEL-NEXT: .LBB0_9: ; %Flow3
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: .LBB0_10: ; %fp-to-i-cleanup
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = fptosi double %x to i128
+ ret i128 %cvt
+}
+
+define i128 @fptoui_f64_to_i128(double %x) {
+; SDAG-LABEL: fptoui_f64_to_i128:
+; SDAG: ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v5, v1
+; SDAG-NEXT: v_bfe_u32 v6, v5, 20, 11
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x3fe
+; SDAG-NEXT: v_mov_b32_e32 v4, v0
+; SDAG-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB1_10
+; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v7, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v7, vcc
+; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v7, vcc
+; SDAG-NEXT: s_mov_b32 s5, -1
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_i64_e32 vcc, -1, v[4:5]
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB1_7
+; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
+; SDAG-NEXT: v_addc_co_u32_e64 v10, s[4:5], 0, -1, s[4:5]
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x432
+; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v5
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v11, -1, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v5, 0x100000, v0
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB1_4
+; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT: v_sub_u32_e32 v0, 0x473, v6
+; SDAG-NEXT: v_add_u32_e32 v2, 0xfffffb8d, v6
+; SDAG-NEXT: v_add_u32_e32 v7, 0xfffffbcd, v6
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v7
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v7
+; SDAG-NEXT: v_cndmask_b32_e64 v6, 0, v1, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v7, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v12, v11, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_mul_lo_u32 v13, v8, v2
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v11, v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, v1
+; SDAG-NEXT: v_mul_lo_u32 v6, v11, v6
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v7, v11, v[2:3]
+; SDAG-NEXT: v_mul_lo_u32 v10, v10, v12
+; SDAG-NEXT: v_add3_u32 v5, v5, v6, v13
+; SDAG-NEXT: v_mov_b32_e32 v6, v2
+; SDAG-NEXT: v_mov_b32_e32 v2, v3
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v12, v8, v[1:2]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v9, v12, v[4:5]
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v9, v9, v7
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v7, v8, v[5:6]
+; SDAG-NEXT: ; implicit-def: $vgpr11
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: v_add3_u32 v4, v10, v4, v9
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v3
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v4, s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: .LBB1_4: ; %Flow
+; SDAG-NEXT: s_andn2_saveexec_b64 s[12:13], s[12:13]
+; SDAG-NEXT: s_cbranch_execz .LBB1_6
+; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x433, v6
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v1, 0, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v0, v4, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v1, v5, s[6:7]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v6, v11, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v5, v11, v[1:2]
+; SDAG-NEXT: v_mov_b32_e32 v7, v4
+; SDAG-NEXT: v_mov_b32_e32 v4, v2
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v6, v8, v[3:4]
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v7, v2
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v5, v8, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v9, v6, v[2:3]
+; SDAG-NEXT: v_mad_u64_u32 v[3:4], s[4:5], v10, v6, v[3:4]
+; SDAG-NEXT: v_mad_i32_i24 v3, v9, v5, v3
+; SDAG-NEXT: .LBB1_6: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB1_7: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
+; SDAG-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-NEXT: ; %bb.9: ; %Flow3
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB1_10: ; %fp-to-i-cleanup
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptoui_f64_to_i128:
+; GISEL: ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v5, v1
+; GISEL-NEXT: v_mov_b32_e32 v4, v0
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5
+; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_mov_b32_e32 v7, 0
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB1_10
+; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xfffffb81, v6
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v3, -1
+; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cmp_lt_i64_e64 s[4:5], -1, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB1_7
+; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 3, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 4, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 5, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 6, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 7, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 8, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 9, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 10, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 11, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 12, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 13, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v19, 14, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v20, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v20
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v20
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT: v_or3_b32 v8, v1, v2, 1
+; GISEL-NEXT: v_or3_b32 v9, v0, v2, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x433
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_and_b32_e32 v2, 0xfffff, v5
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: v_or_b32_e32 v5, 0x100000, v2
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB1_4
+; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT: v_add_u32_e32 v6, 0xfffffbcd, v6
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v9, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v8, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v8, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr10
+; GISEL-NEXT: ; implicit-def: $vgpr9
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v8, v[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr6
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: .LBB1_4: ; %Flow
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[16:17]
+; GISEL-NEXT: s_cbranch_execz .LBB1_6
+; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT: v_sub_co_u32_e32 v6, vcc, 0x433, v6
+; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v6
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, 0
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v2, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v4, v10, 0
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v1, v5, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v4, v8, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[6:7], v5, v9, v[2:3]
+; GISEL-NEXT: v_mul_lo_u32 v6, v5, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v9, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v4, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[6:7], v5, v8, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e64 v3, s[6:7], v3, v4, s[6:7]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: .LBB1_6: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: .LBB1_7: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT: s_cbranch_execz .LBB1_9
+; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT: v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GISEL-NEXT: .LBB1_9: ; %Flow3
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: .LBB1_10: ; %fp-to-i-cleanup
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = fptoui double %x to i128
+ ret i128 %cvt
+}
+
+define i128 @fptosi_f32_to_i128(float %x) {
+; SDAG-LABEL: fptosi_f32_to_i128:
+; SDAG: ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v4, v0
+; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT: s_movk_i32 s4, 0x7e
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB2_10
+; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT: s_mov_b32 s5, -1
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB2_7
+; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
+; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5]
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x95
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB2_4
+; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2
+; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT: v_mov_b32_e32 v6, v1
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, v5
+; SDAG-NEXT: v_mov_b32_e32 v5, v7
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5]
+; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3]
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5
+; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12
+; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6]
+; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
+; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT: .LBB2_4: ; %Flow
+; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
+; SDAG-NEXT: s_cbranch_execz .LBB2_6
+; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2]
+; SDAG-NEXT: v_mov_b32_e32 v1, v5
+; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2]
+; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: .LBB2_6: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: .LBB2_7: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
+; SDAG-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-NEXT: ; %bb.9: ; %Flow3
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB2_10: ; %fp-to-i-cleanup
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptosi_f32_to_i128:
+; GISEL: ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v4, v0
+; GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB2_10
+; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v3, -1
+; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB2_7
+; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v5
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x96
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2
+; GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB2_4
+; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr10
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr6
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr9
+; GISEL-NEXT: .LBB2_4: ; %Flow
+; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
+; GISEL-NEXT: s_cbranch_execz .LBB2_6
+; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6
+; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[4:5]
+; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: .LBB2_6: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: .LBB2_7: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT: s_cbranch_execz .LBB2_9
+; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT: v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GISEL-NEXT: .LBB2_9: ; %Flow3
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: .LBB2_10: ; %fp-to-i-cleanup
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = fptosi float %x to i128
+ ret i128 %cvt
+}
+
+define i128 @fptoui_f32_to_i128(float %x) {
+; SDAG-LABEL: fptoui_f32_to_i128:
+; SDAG: ; %bb.0: ; %fp-to-i-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v4, v0
+; SDAG-NEXT: v_bfe_u32 v5, v4, 23, 8
+; SDAG-NEXT: s_movk_i32 s4, 0x7e
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: v_mov_b32_e32 v3, 0
+; SDAG-NEXT: v_cmp_lt_u32_e32 vcc, s4, v5
+; SDAG-NEXT: s_and_saveexec_b64 s[8:9], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB3_10
+; SDAG-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v5
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, -1, v6, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, -1, v6, vcc
+; SDAG-NEXT: s_movk_i32 s4, 0xff7f
+; SDAG-NEXT: v_addc_co_u32_e32 v3, vcc, -1, v6, vcc
+; SDAG-NEXT: s_mov_b32 s5, -1
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[0:1]
+; SDAG-NEXT: v_cmp_eq_u64_e64 s[6:7], -1, v[2:3]
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, -1, v4
+; SDAG-NEXT: s_and_b64 s[4:5], s[6:7], s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB3_7
+; SDAG-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_add_co_u32_e64 v9, s[4:5], -1, v0
+; SDAG-NEXT: v_addc_co_u32_e64 v11, s[4:5], 0, -1, s[4:5]
+; SDAG-NEXT: s_mov_b64 s[4:5], 0x95
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v4
+; SDAG-NEXT: v_cmp_lt_u64_e64 s[4:5], s[4:5], v[5:6]
+; SDAG-NEXT: v_mov_b32_e32 v7, 0
+; SDAG-NEXT: v_cndmask_b32_e64 v8, -1, 0, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v10, -1, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v6, 0x800000, v0
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], s[4:5]
+; SDAG-NEXT: s_xor_b64 s[12:13], exec, s[6:7]
+; SDAG-NEXT: s_cbranch_execz .LBB3_4
+; SDAG-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; SDAG-NEXT: v_sub_u32_e32 v0, 0xd6, v5
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff2a, v5
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff6a, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v0, v[6:7]
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v4
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v3, v1, s[4:5]
+; SDAG-NEXT: v_cmp_ne_u32_e64 s[6:7], 0, v4
+; SDAG-NEXT: v_cndmask_b32_e64 v3, 0, v1, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, v2, s[6:7]
+; SDAG-NEXT: v_cndmask_b32_e64 v13, 0, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v12, 0, v1, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v13, v10, 0
+; SDAG-NEXT: v_mul_lo_u32 v14, v8, v2
+; SDAG-NEXT: v_mul_lo_u32 v15, v10, v3
+; SDAG-NEXT: v_mov_b32_e32 v6, v1
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v12, v10, v[6:7]
+; SDAG-NEXT: v_mad_u64_u32 v[2:3], s[4:5], v10, v2, 0
+; SDAG-NEXT: v_mov_b32_e32 v6, v5
+; SDAG-NEXT: v_mov_b32_e32 v5, v7
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v13, v8, v[4:5]
+; SDAG-NEXT: v_add3_u32 v3, v3, v15, v14
+; SDAG-NEXT: v_mad_u64_u32 v[1:2], s[4:5], v9, v13, v[2:3]
+; SDAG-NEXT: v_add_co_u32_e64 v5, s[4:5], v6, v5
+; SDAG-NEXT: v_addc_co_u32_e64 v6, s[4:5], 0, 0, s[4:5]
+; SDAG-NEXT: v_mul_lo_u32 v3, v9, v12
+; SDAG-NEXT: v_mul_lo_u32 v7, v11, v13
+; SDAG-NEXT: v_mad_u64_u32 v[5:6], s[4:5], v12, v8, v[5:6]
+; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: v_add3_u32 v3, v7, v2, v3
+; SDAG-NEXT: v_add_co_u32_e64 v2, s[4:5], v5, v1
+; SDAG-NEXT: v_addc_co_u32_e64 v3, s[4:5], v6, v3, s[4:5]
+; SDAG-NEXT: ; implicit-def: $vgpr5_vgpr6
+; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT: .LBB3_4: ; %Flow
+; SDAG-NEXT: s_andn2_saveexec_b64 s[6:7], s[12:13]
+; SDAG-NEXT: s_cbranch_execz .LBB3_6
+; SDAG-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x96, v5
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v2, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e64 s[4:5], 64, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, v0, s[4:5]
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v2
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v0, v6, s[4:5]
+; SDAG-NEXT: v_mad_u64_u32 v[0:1], s[4:5], v3, v10, 0
+; SDAG-NEXT: v_mov_b32_e32 v2, 0
+; SDAG-NEXT: v_mad_u64_u32 v[4:5], s[4:5], v3, v8, v[1:2]
+; SDAG-NEXT: v_mov_b32_e32 v1, v5
+; SDAG-NEXT: v_mad_i64_i32 v[2:3], s[4:5], v9, v3, v[1:2]
+; SDAG-NEXT: v_mov_b32_e32 v1, v4
+; SDAG-NEXT: .LBB3_6: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: .LBB3_7: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; SDAG-NEXT: v_bfrev_b32_e32 v0, 1
+; SDAG-NEXT: v_bfrev_b32_e32 v1, -2
+; SDAG-NEXT: v_cndmask_b32_e64 v2, 0, -1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v3, v0, v1, vcc
+; SDAG-NEXT: v_mov_b32_e32 v0, v2
+; SDAG-NEXT: v_mov_b32_e32 v1, v2
+; SDAG-NEXT: ; %bb.9: ; %Flow3
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB3_10: ; %fp-to-i-cleanup
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: fptoui_f32_to_i128:
+; GISEL: ; %bb.0: ; %fp-to-i-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v4, v0
+; GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], 23, v[4:5]
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_bfe_u32 v6, v0, 0, 8
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x7f
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_mov_b32_e32 v7, v5
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: v_mov_b32_e32 v2, s6
+; GISEL-NEXT: v_mov_b32_e32 v3, s7
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB3_10
+; GISEL-NEXT: ; %bb.1: ; %fp-to-i-if-end
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 0xffffff01, v6
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xffffff80
+; GISEL-NEXT: v_addc_co_u32_e64 v1, s[6:7], 0, -1, vcc
+; GISEL-NEXT: v_mov_b32_e32 v3, -1
+; GISEL-NEXT: v_addc_co_u32_e64 v8, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[0:1], v[2:3]
+; GISEL-NEXT: v_addc_co_u32_e64 v9, s[6:7], 0, -1, s[6:7]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_cmp_le_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cmp_lt_i32_e64 s[4:5], -1, v4
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, -1, v[8:9]
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[14:15], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB3_7
+; GISEL-NEXT: ; %bb.2: ; %fp-to-i-if-end9
+; GISEL-NEXT: s_xor_b64 s[6:7], s[4:5], -1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, -1, s[6:7]
+; GISEL-NEXT: v_and_b32_e32 v0, 1, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v2, 1, v0
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, 1, s[6:7]
+; GISEL-NEXT: v_lshlrev_b16_e32 v3, 2, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v5, 3, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v8, 4, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v9, 5, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v10, 6, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v11, 7, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v12, 8, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v13, 9, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v14, 10, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v15, 11, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v16, 12, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v17, 13, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v18, 14, v0
+; GISEL-NEXT: v_lshlrev_b16_e32 v19, 15, v0
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v2
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v3
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v5
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v5
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v8
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v8
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v9
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v9
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v10
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v10
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v11
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v11
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v12
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v12
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v13
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v13
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v14
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v14
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v15
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v15
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v16
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v16
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v17
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v17
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v18
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v18
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v19
+; GISEL-NEXT: v_or_b32_e32 v1, v1, v19
+; GISEL-NEXT: v_and_b32_e32 v0, 0xffff, v0
+; GISEL-NEXT: v_and_b32_e32 v1, 0xffff, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 16, v0
+; GISEL-NEXT: v_lshl_or_b32 v10, v0, 16, v0
+; GISEL-NEXT: v_or3_b32 v9, v1, v2, 1
+; GISEL-NEXT: v_or3_b32 v8, v0, v2, 0
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x96
+; GISEL-NEXT: v_mov_b32_e32 v1, 0
+; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1]
+; GISEL-NEXT: v_or_b32_e32 v4, 0x800000, v2
+; GISEL-NEXT: v_mov_b32_e32 v5, 0
+; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1_vgpr2_vgpr3
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_xor_b64 s[16:17], exec, s[6:7]
+; GISEL-NEXT: s_cbranch_execz .LBB3_4
+; GISEL-NEXT: ; %bb.3: ; %fp-to-i-if-else
+; GISEL-NEXT: v_add_u32_e32 v6, 0xffffff6a, v6
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, 0, v1, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[6:7], v11, v10, 0
+; GISEL-NEXT: v_subrev_u32_e32 v7, 64, v6
+; GISEL-NEXT: v_sub_u32_e32 v2, 64, v6
+; GISEL-NEXT: v_lshrrev_b64 v[2:3], v2, v[4:5]
+; GISEL-NEXT: v_lshlrev_b64 v[4:5], v7, v[4:5]
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[6:7], 0, v6
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v12, v8, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e32 v2, v4, v2, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v2, 0, s[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v9, v[6:7]
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v11, v9, 0
+; GISEL-NEXT: v_mov_b32_e32 v2, v6
+; GISEL-NEXT: v_mul_lo_u32 v6, v11, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[8:9], v11, v8, v[1:2]
+; GISEL-NEXT: v_mul_lo_u32 v4, v12, v10
+; GISEL-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], s[10:11], v12, v9, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e64 v6, s[10:11], v7, v6, s[10:11]
+; GISEL-NEXT: v_addc_co_u32_e64 v4, s[8:9], v6, v4, s[8:9]
+; GISEL-NEXT: v_mad_u64_u32 v[6:7], s[8:9], v13, v8, v[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v3, v3, 0, s[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr10
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: v_mad_u64_u32 v[3:4], s[6:7], v3, v9, v[6:7]
+; GISEL-NEXT: ; implicit-def: $vgpr6
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr9
+; GISEL-NEXT: .LBB3_4: ; %Flow
+; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[16:17]
+; GISEL-NEXT: s_cbranch_execz .LBB3_6
+; GISEL-NEXT: ; %bb.5: ; %fp-to-i-if-then12
+; GISEL-NEXT: v_sub_co_u32_e32 v3, vcc, 0x96, v6
+; GISEL-NEXT: v_subrev_u32_e32 v2, 64, v3
+; GISEL-NEXT: v_lshrrev_b64 v[0:1], v3, v[4:5]
+; GISEL-NEXT: v_lshrrev_b64 v[1:2], v2, 0
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v3
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v0, v4, vcc
+; GISEL-NEXT: v_mad_u64_u32 v[0:1], s[8:9], v4, v9, 0
+; GISEL-NEXT: v_mad_u64_u32 v[2:3], s[8:9], v4, v10, 0
+; GISEL-NEXT: v_mul_lo_u32 v5, v4, v10
+; GISEL-NEXT: v_mad_u64_u32 v[1:2], vcc, v4, v8, v[1:2]
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, v3, v5, vcc
+; GISEL-NEXT: .LBB3_6: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: .LBB3_7: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[6:7], s[14:15]
+; GISEL-NEXT: s_cbranch_execz .LBB3_9
+; GISEL-NEXT: ; %bb.8: ; %fp-to-i-if-then5
+; GISEL-NEXT: v_cndmask_b32_e64 v1, 0, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v1, 1, v1
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5]
+; GISEL-NEXT: v_lshlrev_b32_e32 v2, 1, v1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 2, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 3, v1
+; GISEL-NEXT: v_or_b32_e32 v2, v1, v2
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 4, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 5, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 6, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 7, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 8, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 9, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 10, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 11, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 12, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v14, 13, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v14
+; GISEL-NEXT: v_lshlrev_b32_e32 v15, 14, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v16, 15, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v13, v14
+; GISEL-NEXT: v_or3_b32 v0, v0, v15, v16
+; GISEL-NEXT: v_lshlrev_b32_e32 v17, 16, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v18, 17, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v15, v16
+; GISEL-NEXT: v_or3_b32 v0, v0, v17, v18
+; GISEL-NEXT: v_lshlrev_b32_e32 v19, 18, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v20, 19, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v17, v18
+; GISEL-NEXT: v_or3_b32 v0, v0, v19, v20
+; GISEL-NEXT: v_lshlrev_b32_e32 v3, 20, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v4, 21, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v19, v20
+; GISEL-NEXT: v_or3_b32 v0, v0, v3, v4
+; GISEL-NEXT: v_lshlrev_b32_e32 v5, 22, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v6, 23, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v3, v4
+; GISEL-NEXT: v_or3_b32 v0, v0, v5, v6
+; GISEL-NEXT: v_lshlrev_b32_e32 v7, 24, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v8, 25, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v5, v6
+; GISEL-NEXT: v_or3_b32 v0, v0, v7, v8
+; GISEL-NEXT: v_lshlrev_b32_e32 v9, 26, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v10, 27, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v7, v8
+; GISEL-NEXT: v_or3_b32 v0, v0, v9, v10
+; GISEL-NEXT: v_lshlrev_b32_e32 v11, 28, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v12, 29, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v9, v10
+; GISEL-NEXT: v_or3_b32 v0, v0, v11, v12
+; GISEL-NEXT: v_lshlrev_b32_e32 v13, 30, v1
+; GISEL-NEXT: v_lshlrev_b32_e32 v1, 31, v1
+; GISEL-NEXT: v_or3_b32 v2, v2, v11, v12
+; GISEL-NEXT: v_or3_b32 v0, v0, v13, v1
+; GISEL-NEXT: v_or3_b32 v1, v2, v13, v1
+; GISEL-NEXT: v_add_u32_e32 v3, 0x80000000, v1
+; GISEL-NEXT: v_mov_b32_e32 v2, v1
+; GISEL-NEXT: .LBB3_9: ; %Flow3
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: .LBB3_10: ; %fp-to-i-cleanup
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = fptoui float %x to i128
+ ret i128 %cvt
+}
+
+define i128 @fptosi_f16_to_i128(half %x) {
+; GCN-LABEL: fptosi_f16_to_i128:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_cvt_i32_f32_e32 v0, v0
+; GCN-NEXT: v_ashrrev_i32_e32 v1, 31, v0
+; GCN-NEXT: v_mov_b32_e32 v2, v1
+; GCN-NEXT: v_mov_b32_e32 v3, v1
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %cvt = fptosi half %x to i128
+ ret i128 %cvt
+}
+
+define i128 @fptoui_f16_to_i128(half %x) {
+; GCN-LABEL: fptoui_f16_to_i128:
+; GCN: ; %bb.0:
+; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GCN-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GCN-NEXT: v_mov_b32_e32 v1, 0
+; GCN-NEXT: v_mov_b32_e32 v2, 0
+; GCN-NEXT: v_mov_b32_e32 v3, 0
+; GCN-NEXT: v_cvt_u32_f32_e32 v0, v0
+; GCN-NEXT: s_setpc_b64 s[30:31]
+ %cvt = fptoui half %x to i128
+ ret i128 %cvt
+}
+
+; FIXME: ExpandLargeFpConvert asserts on bfloat
+; define i128 @fptosi_bf16_to_i128(bfloat %x) {
+; %cvt = fptosi bfloat %x to i128
+; ret i128 %cvt
+; }
+
+; define i128 @fptoui_bf16_to_i128(bfloat %x) {
+; %cvt = fptoui bfloat %x to i128
+; ret i128 %cvt
+; }
diff --git a/llvm/test/CodeGen/AMDGPU/fract-match.ll b/llvm/test/CodeGen/AMDGPU/fract-match.ll
index 3a0b825..e361aa4 100644
--- a/llvm/test/CodeGen/AMDGPU/fract-match.ll
+++ b/llvm/test/CodeGen/AMDGPU/fract-match.ll
@@ -1705,16 +1705,16 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
; GFX6-NEXT: v_min_f32_e32 v7, 0x3f7fffff, v7
; GFX6-NEXT: v_cndmask_b32_e32 v6, v6, v1, vcc
; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX6-NEXT: s_movk_i32 s10, 0x204
+; GFX6-NEXT: v_mov_b32_e32 v8, 0x204
; GFX6-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc
-; GFX6-NEXT: v_cmp_class_f32_e64 s[8:9], v0, s10
+; GFX6-NEXT: v_cmp_class_f32_e32 vcc, v0, v8
; GFX6-NEXT: s_mov_b32 s6, 0
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v7, 0, s[8:9]
-; GFX6-NEXT: v_cmp_class_f32_e64 s[8:9], v1, s10
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v7, 0, vcc
+; GFX6-NEXT: v_cmp_class_f32_e32 vcc, v1, v8
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, 0, s[8:9]
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v6, 0, vcc
; GFX6-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -1722,19 +1722,19 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
; GFX7-LABEL: safe_math_fract_v2f32:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT: v_mov_b32_e32 v8, 0x204
; GFX7-NEXT: v_fract_f32_e32 v6, v0
-; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8
+; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v0, v8
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_floor_f32_e32 v4, v0
; GFX7-NEXT: v_fract_f32_e32 v7, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v1|, s8
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX7-NEXT: v_cmp_class_f32_e32 vcc, v1, v8
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
; GFX7-NEXT: v_floor_f32_e32 v5, v1
-; GFX7-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v7, 0, vcc
; GFX7-NEXT: buffer_store_dwordx2 v[4:5], v[2:3], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
@@ -1742,15 +1742,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
; GFX8-LABEL: safe_math_fract_v2f32:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_mov_b32 s4, 0x7f800000
+; GFX8-NEXT: v_mov_b32_e32 v8, 0x204
; GFX8-NEXT: v_fract_f32_e32 v6, v0
-; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s4
+; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v0, v8
; GFX8-NEXT: v_floor_f32_e32 v4, v0
; GFX8-NEXT: v_fract_f32_e32 v7, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc
-; GFX8-NEXT: v_cmp_neq_f32_e64 vcc, |v1|, s4
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v6, 0, vcc
+; GFX8-NEXT: v_cmp_class_f32_e32 vcc, v1, v8
; GFX8-NEXT: v_floor_f32_e32 v5, v1
-; GFX8-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v7, 0, vcc
; GFX8-NEXT: global_store_dwordx2 v[2:3], v[4:5], off
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
@@ -1759,14 +1759,15 @@ define <2 x float> @safe_math_fract_v2f32(<2 x float> %x, ptr addrspace(1) nocap
; GFX11: ; %bb.0: ; %entry
; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX11-NEXT: v_fract_f32_e32 v6, v0
-; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v0|
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, v0, 0x204
; GFX11-NEXT: v_fract_f32_e32 v7, v1
; GFX11-NEXT: v_floor_f32_e32 v4, v0
; GFX11-NEXT: v_floor_f32_e32 v5, v1
-; GFX11-NEXT: v_cndmask_b32_e32 v0, 0, v6, vcc_lo
-; GFX11-NEXT: v_cmp_neq_f32_e64 vcc_lo, 0x7f800000, |v1|
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX11-NEXT: v_cndmask_b32_e64 v0, v6, 0, s0
+; GFX11-NEXT: v_cmp_class_f32_e64 s0, v1, 0x204
; GFX11-NEXT: global_store_b64 v[2:3], v[4:5], off
-; GFX11-NEXT: v_cndmask_b32_e32 v1, 0, v7, vcc_lo
+; GFX11-NEXT: v_cndmask_b32_e64 v1, v7, 0, s0
; GFX11-NEXT: s_setpc_b64 s[30:31]
entry:
%floor = tail call <2 x float> @llvm.floor.v2f32(<2 x float> %x)
@@ -1937,21 +1938,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
; GFX6: ; %bb.0: ; %entry
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b32 s8, 0x7f800000
+; GFX6-NEXT: s_movk_i32 s8, 0x7c00
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: s_mov_b32 s7, 0xf000
-; GFX6-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX6-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: v_floor_f32_e32 v3, v0
-; GFX6-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX6-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX6-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4
-; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX6-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8
-; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX6-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GFX6-NEXT: v_floor_f32_e32 v4, v3
+; GFX6-NEXT: v_sub_f32_e32 v5, v3, v4
+; GFX6-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX6-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5
+; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX6-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX6-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
;
@@ -1959,21 +1961,22 @@ define half @safe_math_fract_f16(half %x, ptr addrspace(1) nocapture writeonly %
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT: s_movk_i32 s8, 0x7c00
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: v_cvt_f32_f16_e32 v0, v0
+; GFX7-NEXT: v_cvt_f32_f16_e32 v3, v0
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_floor_f32_e32 v3, v0
-; GFX7-NEXT: v_sub_f32_e32 v4, v0, v3
-; GFX7-NEXT: v_cvt_f16_f32_e32 v3, v3
-; GFX7-NEXT: v_min_f32_e32 v4, 0x3f7fe000, v4
-; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v0, v0
-; GFX7-NEXT: v_cndmask_b32_e32 v4, v4, v0, vcc
-; GFX7-NEXT: v_cmp_neq_f32_e64 vcc, |v0|, s8
-; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v4, vcc
-; GFX7-NEXT: buffer_store_short v3, v[1:2], s[4:7], 0 addr64
+; GFX7-NEXT: v_floor_f32_e32 v4, v3
+; GFX7-NEXT: v_sub_f32_e32 v5, v3, v4
+; GFX7-NEXT: v_cvt_f16_f32_e32 v4, v4
+; GFX7-NEXT: v_min_f32_e32 v5, 0x3f7fe000, v5
+; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v3, v3
+; GFX7-NEXT: v_cndmask_b32_e32 v3, v5, v3, vcc
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0
+; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v3, vcc
+; GFX7-NEXT: buffer_store_short v4, v[1:2], s[4:7], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
@@ -2062,12 +2065,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX6-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX6-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX6-NEXT: s_mov_b32 s8, 0x7f800000
+; GFX6-NEXT: s_movk_i32 s8, 0x7c00
; GFX6-NEXT: s_mov_b32 s6, 0
; GFX6-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX6-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX6-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX6-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX6-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX6-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; GFX6-NEXT: v_floor_f32_e32 v6, v4
; GFX6-NEXT: v_cvt_f16_f32_e32 v7, v6
; GFX6-NEXT: v_floor_f32_e32 v8, v5
@@ -2080,10 +2083,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
; GFX6-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX6-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX6-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v0
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0
; GFX6-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX6-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
-; GFX6-NEXT: v_cmp_neq_f32_e32 vcc, s8, v1
+; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
@@ -2098,12 +2101,12 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX7-NEXT: v_cvt_f16_f32_e32 v1, v1
; GFX7-NEXT: v_cvt_f16_f32_e32 v0, v0
-; GFX7-NEXT: s_mov_b32 s8, 0x7f800000
+; GFX7-NEXT: s_movk_i32 s8, 0x7c00
; GFX7-NEXT: s_mov_b32 s6, 0
; GFX7-NEXT: v_cvt_f32_f16_e32 v4, v1
; GFX7-NEXT: v_cvt_f32_f16_e32 v5, v0
-; GFX7-NEXT: v_cvt_f32_f16_e64 v0, |v0|
-; GFX7-NEXT: v_cvt_f32_f16_e64 v1, |v1|
+; GFX7-NEXT: v_and_b32_e32 v0, 0x7fff, v0
+; GFX7-NEXT: v_and_b32_e32 v1, 0x7fff, v1
; GFX7-NEXT: v_floor_f32_e32 v6, v4
; GFX7-NEXT: v_cvt_f16_f32_e32 v7, v6
; GFX7-NEXT: v_floor_f32_e32 v8, v5
@@ -2116,10 +2119,10 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
; GFX7-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc
; GFX7-NEXT: v_cmp_u_f32_e32 vcc, v5, v5
; GFX7-NEXT: v_cndmask_b32_e32 v5, v8, v5, vcc
-; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v0
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v0
; GFX7-NEXT: v_lshlrev_b32_e32 v7, 16, v7
; GFX7-NEXT: v_cndmask_b32_e32 v0, 0, v5, vcc
-; GFX7-NEXT: v_cmp_neq_f32_e32 vcc, s8, v1
+; GFX7-NEXT: v_cmp_ne_u32_e32 vcc, s8, v1
; GFX7-NEXT: s_mov_b32 s7, 0xf000
; GFX7-NEXT: s_mov_b32 s4, s6
; GFX7-NEXT: s_mov_b32 s5, s6
@@ -2133,16 +2136,16 @@ define <2 x half> @safe_math_fract_v2f16(<2 x half> %x, ptr addrspace(1) nocaptu
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v0
-; GFX8-NEXT: s_movk_i32 s6, 0x204
+; GFX8-NEXT: v_mov_b32_e32 v7, 0x204
; GFX8-NEXT: v_floor_f16_e32 v4, v3
; GFX8-NEXT: v_floor_f16_e32 v5, v0
; GFX8-NEXT: v_fract_f16_e32 v6, v3
-; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v3, s6
+; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v3, v7
; GFX8-NEXT: v_pack_b32_f16 v4, v5, v4
; GFX8-NEXT: v_fract_f16_e32 v5, v0
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, s[4:5]
-; GFX8-NEXT: v_cmp_class_f16_e64 s[4:5], v0, s6
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, 0, vcc
+; GFX8-NEXT: v_cmp_class_f16_e32 vcc, v0, v7
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v5, 0, vcc
; GFX8-NEXT: v_pack_b32_f16 v0, v0, v3
; GFX8-NEXT: global_store_dword v[1:2], v4, off
; GFX8-NEXT: s_waitcnt vmcnt(0)
@@ -2237,19 +2240,19 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
; GFX6-NEXT: v_cndmask_b32_e32 v11, v11, v3, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v10, v10, v2, vcc
; GFX6-NEXT: v_cmp_u_f64_e32 vcc, v[0:1], v[0:1]
-; GFX6-NEXT: s_movk_i32 s10, 0x204
-; GFX6-NEXT: v_cmp_class_f64_e64 s[8:9], v[0:1], s10
+; GFX6-NEXT: v_mov_b32_e32 v14, 0x204
; GFX6-NEXT: v_cndmask_b32_e32 v13, v13, v1, vcc
; GFX6-NEXT: v_cndmask_b32_e32 v12, v12, v0, vcc
-; GFX6-NEXT: v_cndmask_b32_e64 v0, v12, 0, s[8:9]
-; GFX6-NEXT: v_cndmask_b32_e64 v1, v13, 0, s[8:9]
-; GFX6-NEXT: v_cmp_class_f64_e64 s[8:9], v[2:3], s10
+; GFX6-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v14
; GFX6-NEXT: s_mov_b32 s6, 0
+; GFX6-NEXT: v_cndmask_b32_e64 v0, v12, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v1, v13, 0, vcc
+; GFX6-NEXT: v_cmp_class_f64_e32 vcc, v[2:3], v14
; GFX6-NEXT: s_mov_b32 s7, 0xf000
; GFX6-NEXT: s_mov_b32 s4, s6
; GFX6-NEXT: s_mov_b32 s5, s6
-; GFX6-NEXT: v_cndmask_b32_e64 v2, v10, 0, s[8:9]
-; GFX6-NEXT: v_cndmask_b32_e64 v3, v11, 0, s[8:9]
+; GFX6-NEXT: v_cndmask_b32_e64 v2, v10, 0, vcc
+; GFX6-NEXT: v_cndmask_b32_e64 v3, v11, 0, vcc
; GFX6-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
; GFX6-NEXT: s_waitcnt vmcnt(0) expcnt(0)
; GFX6-NEXT: s_setpc_b64 s[30:31]
@@ -2257,39 +2260,39 @@ define <2 x double> @safe_math_fract_v2f64(<2 x double> %x, ptr addrspace(1) noc
; GFX7-LABEL: safe_math_fract_v2f64:
; GFX7: ; %bb.0: ; %entry
; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX7-NEXT: s_movk_i32 s4, 0x204
+; GFX7-NEXT: v_mov_b32_e32 v6, 0x204
; GFX7-NEXT: v_fract_f64_e32 v[10:11], v[0:1]
-; GFX7-NEXT: v_cmp_class_f64_e64 s[8:9], v[0:1], s4
+; GFX7-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v6
; GFX7-NEXT: v_fract_f64_e32 v[12:13], v[2:3]
-; GFX7-NEXT: v_cmp_class_f64_e64 s[10:11], v[2:3], s4
+; GFX7-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v6
; GFX7-NEXT: v_floor_f64_e32 v[8:9], v[2:3]
; GFX7-NEXT: v_floor_f64_e32 v[6:7], v[0:1]
-; GFX7-NEXT: s_mov_b32 s6, 0
-; GFX7-NEXT: s_mov_b32 s7, 0xf000
-; GFX7-NEXT: s_mov_b32 s4, s6
-; GFX7-NEXT: s_mov_b32 s5, s6
-; GFX7-NEXT: v_cndmask_b32_e64 v0, v10, 0, s[8:9]
-; GFX7-NEXT: v_cndmask_b32_e64 v1, v11, 0, s[8:9]
-; GFX7-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[10:11]
-; GFX7-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[10:11]
-; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[4:7], 0 addr64
+; GFX7-NEXT: s_mov_b32 s10, 0
+; GFX7-NEXT: s_mov_b32 s11, 0xf000
+; GFX7-NEXT: s_mov_b32 s8, s10
+; GFX7-NEXT: s_mov_b32 s9, s10
+; GFX7-NEXT: v_cndmask_b32_e64 v0, v10, 0, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v1, v11, 0, vcc
+; GFX7-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[4:5]
+; GFX7-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[4:5]
+; GFX7-NEXT: buffer_store_dwordx4 v[6:9], v[4:5], s[8:11], 0 addr64
; GFX7-NEXT: s_waitcnt vmcnt(0)
; GFX7-NEXT: s_setpc_b64 s[30:31]
;
; GFX8-LABEL: safe_math_fract_v2f64:
; GFX8: ; %bb.0: ; %entry
; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX8-NEXT: s_movk_i32 s6, 0x204
+; GFX8-NEXT: v_mov_b32_e32 v6, 0x204
; GFX8-NEXT: v_fract_f64_e32 v[10:11], v[0:1]
-; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[0:1], s6
+; GFX8-NEXT: v_cmp_class_f64_e32 vcc, v[0:1], v6
; GFX8-NEXT: v_fract_f64_e32 v[12:13], v[2:3]
-; GFX8-NEXT: v_cmp_class_f64_e64 s[6:7], v[2:3], s6
+; GFX8-NEXT: v_cmp_class_f64_e64 s[4:5], v[2:3], v6
; GFX8-NEXT: v_floor_f64_e32 v[8:9], v[2:3]
; GFX8-NEXT: v_floor_f64_e32 v[6:7], v[0:1]
-; GFX8-NEXT: v_cndmask_b32_e64 v0, v10, 0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v1, v11, 0, s[4:5]
-; GFX8-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[6:7]
-; GFX8-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[6:7]
+; GFX8-NEXT: v_cndmask_b32_e64 v0, v10, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v1, v11, 0, vcc
+; GFX8-NEXT: v_cndmask_b32_e64 v2, v12, 0, s[4:5]
+; GFX8-NEXT: v_cndmask_b32_e64 v3, v13, 0, s[4:5]
; GFX8-NEXT: global_store_dwordx4 v[4:5], v[6:9], off
; GFX8-NEXT: s_waitcnt vmcnt(0)
; GFX8-NEXT: s_setpc_b64 s[30:31]
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
index e3fada3..b717280 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomic_optimizer_fp_rtn.ll
@@ -1,71 +1,43 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
-; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=iterative>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=dpp>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s
+
+; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations.
+; Optimization remains the same for Iterative and DPP strategies when the value is uniform. These different scan/reduction
+; strategies are valid for only divergent values. This optimization is valid for divergent addresses. Test also covers different scopes.
define amdgpu_ps float @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
-; IR-ITERATIVE: 2:
-; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-ITERATIVE: 14:
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: br label [[TMP16]]
-; IR-ITERATIVE: 16:
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
-; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
-; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
-; IR-ITERATIVE-NEXT: br label [[TMP24]]
-; IR-ITERATIVE: 24:
-; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP25]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
-; IR-DPP: 2:
-; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-DPP: 14:
-; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP16]]
-; IR-DPP: 16:
-; IR-DPP-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
-; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
-; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
-; IR-DPP-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
-; IR-DPP-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-DPP-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
-; IR-DPP-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
-; IR-DPP-NEXT: br label [[TMP24]]
-; IR-DPP: 24:
-; IR-DPP-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
-; IR-DPP-NEXT: ret float [[TMP25]]
+; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP24:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
+; IR-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
+; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR: 14:
+; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT: br label [[TMP16]]
+; IR: 16:
+; IR-NEXT: [[TMP17:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-NEXT: [[TMP18:%.*]] = bitcast float [[TMP17]] to i32
+; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP18]])
+; IR-NEXT: [[TMP20:%.*]] = bitcast i32 [[TMP19]] to float
+; IR-NEXT: [[TMP21:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT: [[TMP22:%.*]] = fmul float [[VAL]], [[TMP21]]
+; IR-NEXT: [[TMP23:%.*]] = fadd float [[TMP20]], [[TMP22]]
+; IR-NEXT: br label [[TMP24]]
+; IR: 24:
+; IR-NEXT: [[TMP25:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP23]], [[TMP16]] ]
+; IR-NEXT: ret float [[TMP25]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret float %result
@@ -411,7 +383,6 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_uni_value_agent_scope_str
ret float %result
}
-
define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
@@ -514,61 +485,33 @@ define amdgpu_ps float @global_atomic_fsub_uni_address_div_value_agent_scope_str
}
define amdgpu_ps float @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
-; IR-ITERATIVE: 2:
-; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: br label [[TMP12]]
-; IR-ITERATIVE: 12:
-; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
-; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
-; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
-; IR-ITERATIVE-NEXT: br label [[TMP20]]
-; IR-ITERATIVE: 20:
-; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-ITERATIVE-NEXT: ret float [[TMP21]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
-; IR-DPP: 2:
-; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-DPP: 10:
-; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP12]]
-; IR-DPP: 12:
-; IR-DPP-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
-; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
-; IR-DPP-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
-; IR-DPP-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
-; IR-DPP-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
-; IR-DPP-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
-; IR-DPP-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
-; IR-DPP-NEXT: br label [[TMP20]]
-; IR-DPP: 20:
-; IR-DPP-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
-; IR-DPP-NEXT: ret float [[TMP21]]
+; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP20:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi float [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = bitcast float [[TMP13]] to i32
+; IR-NEXT: [[TMP15:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP14]])
+; IR-NEXT: [[TMP16:%.*]] = bitcast i32 [[TMP15]] to float
+; IR-NEXT: [[TMP17:%.*]] = uitofp i32 [[TMP8]] to float
+; IR-NEXT: [[TMP18:%.*]] = select i1 [[TMP9]], float 0x7FF0000000000000, float [[VAL]]
+; IR-NEXT: [[TMP19:%.*]] = call float @llvm.minnum.f32(float [[TMP16]], float [[TMP18]])
+; IR-NEXT: br label [[TMP20]]
+; IR: 20:
+; IR-NEXT: [[TMP21:%.*]] = phi float [ poison, [[TMP0:%.*]] ], [ [[TMP19]], [[TMP12]] ]
+; IR-NEXT: ret float [[TMP21]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
@@ -1007,164 +950,674 @@ define amdgpu_ps float @global_atomic_fadd_uni_address_div_value_system_scope_st
ret float %result
}
-
define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret float %result
}
define amdgpu_ps float @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret float %result
}
define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
ret float %result
}
define amdgpu_ps float @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
ret float %result
}
define amdgpu_ps float @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
}
-
define amdgpu_ps float @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
}
define amdgpu_ps float @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
}
define amdgpu_ps float @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
}
define amdgpu_ps float @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
}
define amdgpu_ps float @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret float %result
}
define amdgpu_ps float @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
ret float %result
}
define amdgpu_ps float @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-ITERATIVE-NEXT: ret float [[RESULT]]
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
-; IR-DPP-NEXT: ret float [[RESULT]]
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret float [[RESULT]]
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
ret float %result
}
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to double
+; IR-NEXT: [[TMP12:%.*]] = fmul double [[VAL:%.*]], [[TMP11]]
+; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR: 14:
+; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT: br label [[TMP16]]
+; IR: 16:
+; IR-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
+; IR-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
+; IR-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
+; IR-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]])
+; IR-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]])
+; IR-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
+; IR-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
+; IR-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
+; IR-NEXT: [[TMP27:%.*]] = uitofp i32 [[TMP8]] to double
+; IR-NEXT: [[TMP28:%.*]] = fmul double [[VAL]], [[TMP27]]
+; IR-NEXT: [[TMP29:%.*]] = fadd double [[TMP26]], [[TMP28]]
+; IR-NEXT: br label [[TMP30]]
+; IR: 30:
+; IR-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
+; IR-NEXT: ret double [[TMP31]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 14:
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
+; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP30]]
+; IR-ITERATIVE: 30:
+; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP31]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-DPP: 14:
+; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP16]]
+; IR-DPP: 16:
+; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
+; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
+; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
+; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
+; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
+; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP30]]
+; IR-DPP: 30:
+; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
+; IR-DPP-NEXT: ret double [[TMP31]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 14:
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
+; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP30]]
+; IR-ITERATIVE: 30:
+; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP31]]
+;
+; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-DPP: 14:
+; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP16]]
+; IR-DPP: 16:
+; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
+; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
+; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
+; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
+; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
+; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP30]]
+; IR-DPP: 30:
+; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
+; IR-DPP-NEXT: ret double [[TMP31]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64
+; IR-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
+; IR-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
+; IR-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]])
+; IR-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]])
+; IR-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
+; IR-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
+; IR-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
+; IR-NEXT: [[TMP23:%.*]] = uitofp i32 [[TMP8]] to double
+; IR-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0x7FF0000000000000, double [[VAL]]
+; IR-NEXT: [[TMP25:%.*]] = call double @llvm.minnum.f64(double [[TMP22]], double [[TMP24]])
+; IR-NEXT: br label [[TMP26]]
+; IR: 26:
+; IR-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ]
+; IR-NEXT: ret double [[TMP27]]
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-ITERATIVE-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-ITERATIVE-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]]
+; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP26]]
+; IR-ITERATIVE: 26:
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP27]]
+;
+; IR-DPP-LABEL: @global_atomic__fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP26:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR-DPP: 10:
+; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP12]]
+; IR-DPP: 12:
+; IR-DPP-NEXT: [[TMP13:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP11]], [[TMP10]] ]
+; IR-DPP-NEXT: [[TMP14:%.*]] = bitcast double [[TMP13]] to i64
+; IR-DPP-NEXT: [[TMP15:%.*]] = trunc i64 [[TMP14]] to i32
+; IR-DPP-NEXT: [[TMP16:%.*]] = lshr i64 [[TMP14]], 32
+; IR-DPP-NEXT: [[TMP17:%.*]] = trunc i64 [[TMP16]] to i32
+; IR-DPP-NEXT: [[TMP18:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP15]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP19:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP17]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP20:%.*]] = insertelement <2 x i32> poison, i32 [[TMP18]], i32 0
+; IR-DPP-NEXT: [[TMP21:%.*]] = insertelement <2 x i32> [[TMP20]], i32 [[TMP19]], i32 1
+; IR-DPP-NEXT: [[TMP22:%.*]] = bitcast <2 x i32> [[TMP21]] to double
+; IR-DPP-NEXT: [[TMP23:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = select i1 [[TMP9]], double 0xFFF0000000000000, double [[VAL]]
+; IR-DPP-NEXT: [[TMP25:%.*]] = call double @llvm.experimental.constrained.maxnum.f64(double [[TMP22]], double [[TMP24]], metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP26]]
+; IR-DPP: 26:
+; IR-DPP-NEXT: [[TMP27:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP25]], [[TMP12]] ]
+; IR-DPP-NEXT: ret double [[TMP27]]
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{
+; IR-LABEL: @global_atomic__fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 14:
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-ITERATIVE-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
+; IR-ITERATIVE-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-ITERATIVE-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
+; IR-ITERATIVE-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
+; IR-ITERATIVE-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
+; IR-ITERATIVE-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
+; IR-ITERATIVE-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
+; IR-ITERATIVE-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br label [[TMP30]]
+; IR-ITERATIVE: 30:
+; IR-ITERATIVE-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
+; IR-ITERATIVE-NEXT: ret double [[TMP31]]
+;
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP30:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-DPP: 14:
+; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP16]]
+; IR-DPP: 16:
+; IR-DPP-NEXT: [[TMP17:%.*]] = phi double [ poison, [[TMP2]] ], [ [[TMP15]], [[TMP14]] ]
+; IR-DPP-NEXT: [[TMP18:%.*]] = bitcast double [[TMP17]] to i64
+; IR-DPP-NEXT: [[TMP19:%.*]] = trunc i64 [[TMP18]] to i32
+; IR-DPP-NEXT: [[TMP20:%.*]] = lshr i64 [[TMP18]], 32
+; IR-DPP-NEXT: [[TMP21:%.*]] = trunc i64 [[TMP20]] to i32
+; IR-DPP-NEXT: [[TMP22:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP19]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP23:%.*]] = call i32 @llvm.amdgcn.readfirstlane(i32 [[TMP21]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP24:%.*]] = insertelement <2 x i32> poison, i32 [[TMP22]], i32 0
+; IR-DPP-NEXT: [[TMP25:%.*]] = insertelement <2 x i32> [[TMP24]], i32 [[TMP23]], i32 1
+; IR-DPP-NEXT: [[TMP26:%.*]] = bitcast <2 x i32> [[TMP25]] to double
+; IR-DPP-NEXT: [[TMP27:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP8]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP28:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL]], double [[TMP27]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP29:%.*]] = call double @llvm.experimental.constrained.fadd.f64(double [[TMP26]], double [[TMP28]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: br label [[TMP30]]
+; IR-DPP: 30:
+; IR-DPP-NEXT: [[TMP31:%.*]] = phi double [ poison, [[TMP0:%.*]] ], [ [[TMP29]], [[TMP16]] ]
+; IR-DPP-NEXT: ret double [[TMP31]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
+ ret double %result
+}
+
+; fadd on double with an inreg %ptr and a non-inreg %val, no syncscope qualifier, under attributes #2 (strictfp).
+; The check lines expect the atomicrmw to be left exactly as written.
+define amdgpu_ps double @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
+ ret double %result
+}
+
+; fadd on double with a non-inreg %ptr and an inreg %val at syncscope("agent"), under attributes #0
+; (which set "amdgpu-unsafe-fp-atomics"="true"). The check lines expect the atomicrmw to be left exactly as written.
+define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret double %result
+}
+
+; fadd on double where neither %ptr nor %val is inreg, at syncscope("agent"), under attributes #0
+; (which set "amdgpu-unsafe-fp-atomics"="true"). The check lines expect the atomicrmw to be left exactly as written.
+define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret double %result
+}
+
+; fadd on double with a non-inreg %ptr and an inreg %val at syncscope("one-as"), under attributes #1
+; (strictfp plus "amdgpu-unsafe-fp-atomics"="true"). The source omits the alignment; the check lines
+; expect the atomicrmw unchanged apart from a printed align 8.
+; NOTE(review): "structfp" in the name looks like a typo for "strictfp" - confirm before renaming.
+define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+; fadd on double where neither %ptr nor %val is inreg, at syncscope("one-as"), under attributes #1
+; (strictfp plus "amdgpu-unsafe-fp-atomics"="true"). The source omits the alignment; the check lines
+; expect the atomicrmw unchanged apart from a printed align 8.
+; NOTE(review): "structfp" in the name looks like a typo for "strictfp" - confirm before renaming.
+define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret double %result
+}
+
+; fsub on double with a non-inreg %ptr and an inreg %val at syncscope("agent"), under attributes #2 (strictfp).
+; The source omits the alignment; the check lines expect the atomicrmw unchanged apart from a printed align 8.
+define amdgpu_ps double @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 {
+; IR-LABEL: @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+; fsub on double where neither %ptr nor %val is inreg, at syncscope("agent"), under attributes #2 (strictfp).
+; The source omits the alignment; the check lines expect the atomicrmw unchanged apart from a printed align 8.
+define amdgpu_ps double @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+; fmin on double with a non-inreg %ptr and an inreg %val at syncscope("agent"), under attributes #0.
+; The source omits the alignment; the check lines expect the atomicrmw unchanged apart from a printed align 8.
+define amdgpu_ps double @global_atomic_fmin_double_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_div_address_uni_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+; fmin on double where neither %ptr nor %val is inreg, at syncscope("agent"), under attributes #0.
+; The source omits the alignment; the check lines expect the atomicrmw unchanged apart from a printed align 8.
+define amdgpu_ps double @global_atomic_fmin_double_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_div_address_div_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+; fmax on double with a non-inreg %ptr and an inreg %val at syncscope("agent"), under attributes #1
+; (strictfp plus "amdgpu-unsafe-fp-atomics"="true"). The source omits the alignment; the check lines
+; expect the atomicrmw unchanged apart from a printed align 8.
+; NOTE(review): the doubled underscore and "structfp" in the name look unintentional - confirm before renaming.
+define amdgpu_ps double @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{
+; IR-LABEL: @global_atomic__fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+; fmax on double where neither %ptr nor %val is inreg, at syncscope("agent"), under attributes #1
+; (strictfp plus "amdgpu-unsafe-fp-atomics"="true"). The source omits the alignment; the check lines
+; expect the atomicrmw unchanged apart from a printed align 8.
+; NOTE(review): the doubled underscore and "structfp" in the name look unintentional - confirm before renaming.
+define amdgpu_ps double @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{
+; IR-LABEL: @global_atomic__fmax_double_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret double %result
+}
+
+; fadd on double with a non-inreg %ptr and an inreg %val, no syncscope qualifier, under attributes #2 (strictfp).
+; The check lines expect the atomicrmw to be left exactly as written.
+define amdgpu_ps double @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
+ ret double %result
+}
+
+; fadd on double where neither %ptr nor %val is inreg, no syncscope qualifier, under attributes #2 (strictfp).
+; The check lines expect the atomicrmw to be left exactly as written.
+define amdgpu_ps double @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret double [[RESULT]]
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
+ ret double %result
+}
+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #2 = { strictfp }
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
index 76ec1cc..99d02ff 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i32_system.ll
@@ -358,65 +358,6 @@ define amdgpu_gfx i32 @global_atomic_xchg_i32_ret_offset_scalar(ptr addrspace(1)
; ---------------------------------------------------------------------
define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_load_dword v3, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v3, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB0_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_load_dword v3, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v3, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB0_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v3, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB0_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_noret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -450,69 +391,6 @@ define void @global_atomic_xchg_f32_noret(ptr addrspace(1) %ptr, float %in) {
}
define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret_offset:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_f32_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: global_load_dword v3, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v3, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB1_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret_offset:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: global_load_dword v3, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v3, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB1_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret_offset:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB1_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_noret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -549,71 +427,6 @@ define void @global_atomic_xchg_f32_noret_offset(ptr addrspace(1) %out, float %i
}
define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_load_dword v4, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v4
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB2_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v0, v4
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v4
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB2_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v0, v4
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v4, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB2_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -648,73 +461,6 @@ define float @global_atomic_xchg_f32_ret(ptr addrspace(1) %ptr, float %in) {
}
define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret_offset:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_f32_e32 v4, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: global_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v0
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB3_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret_offset:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: global_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v0
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB3_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret_offset:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB3_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -752,80 +498,6 @@ define float @global_atomic_xchg_f32_ret_offset(ptr addrspace(1) %out, float %in
}
define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: global_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[34:35], 0
-; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB4_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: global_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[34:35], 0
-; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB4_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v1, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB4_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_noret_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -876,84 +548,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_scalar(ptr addrspace(1) inr
}
define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: s_add_u32 s34, s4, 16
-; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: global_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
-; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_execnz .LBB5_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s34, s4, 16
-; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: global_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
-; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_execnz .LBB5_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB5_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_noret_offset_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1007,83 +601,6 @@ define amdgpu_gfx void @global_atomic_xchg_f32_noret_offset_scalar(ptr addrspace
}
define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inreg %ptr, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: global_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[34:35], 0
-; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s6
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_mov_b32_e32 v4, s5
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB6_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: global_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[34:35], 0
-; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s6
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB6_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v0, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s6
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB6_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_ret_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1134,87 +651,6 @@ define amdgpu_gfx float @global_atomic_xchg_f32_ret_scalar(ptr addrspace(1) inre
}
define amdgpu_gfx float @global_atomic_xchg_f32_ret_offset_scalar(ptr addrspace(1) inreg %out, float inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: s_add_u32 s34, s4, 16
-; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: global_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
-; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s6
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_mov_b32_e32 v4, s35
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_execnz .LBB7_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s34, s4, 16
-; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: global_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
-; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s6
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_mov_b32_e32 v4, s35
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_execnz .LBB7_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s6
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB7_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f32_ret_offset_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
index d137f47..380ce7f 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_i64_system.ll
@@ -372,65 +372,6 @@ define amdgpu_gfx i64 @global_atomic_xchg_i64_ret_offset_scalar(ptr addrspace(1)
; ---------------------------------------------------------------------
define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_load_dword v3, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB0_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v3, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB0_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_load_dword v3, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB0_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v3, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB0_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v3, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB0_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB0_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_noret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -464,69 +405,6 @@ define void @global_atomic_xchg_f64_noret(ptr addrspace(1) %ptr, double %in) {
}
define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret_offset:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_f64_e32 v0, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN1-NEXT: global_load_dword v3, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB1_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v3, v4
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB1_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret_offset:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v0, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GCN2-NEXT: global_load_dword v3, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB1_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v3, v4
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB1_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret_offset:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v3, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB1_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB1_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_noret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -563,71 +441,6 @@ define void @global_atomic_xchg_f64_noret_offset(ptr addrspace(1) %out, double %
}
define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_load_dword v4, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB2_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v4
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB2_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: v_mov_b32_e32 v0, v4
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_load_dword v4, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB2_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v4
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB2_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: v_mov_b32_e32 v0, v4
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v4, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB2_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB2_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_ret:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -663,73 +476,6 @@ define double @global_atomic_xchg_f64_ret(ptr addrspace(1) %ptr, double %in) {
}
define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret_offset:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_add_f64_e32 v4, vcc, 16, v0
-; GCN1-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN1-NEXT: global_load_dword v0, v[4:5]
-; GCN1-NEXT: s_mov_b64 s[4:5], 0
-; GCN1-NEXT: .LBB3_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v3, v0
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN1-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_cbranch_execnz .LBB3_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret_offset:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_add_u32_e32 v4, vcc, 16, v0
-; GCN2-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc
-; GCN2-NEXT: global_load_dword v0, v[4:5]
-; GCN2-NEXT: s_mov_b64 s[4:5], 0
-; GCN2-NEXT: .LBB3_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v3, v0
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[4:5], v[2:3] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v3
-; GCN2-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_cbranch_execnz .LBB3_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret_offset:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_load_dword v4, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[4:5], 0
-; GCN3-NEXT: .LBB3_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v3, v4
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v4, v[0:1], v[2:3] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v4, v3
-; GCN3-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[4:5]
-; GCN3-NEXT: s_cbranch_execnz .LBB3_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[4:5]
-; GCN3-NEXT: v_mov_b32_e32 v0, v4
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_ret_offset:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -768,80 +514,6 @@ define double @global_atomic_xchg_f64_ret_offset(ptr addrspace(1) %out, double %
}
define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: global_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[34:35], 0
-; GCN1-NEXT: .LBB4_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s4
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v3, s5
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB4_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: global_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[34:35], 0
-; GCN2-NEXT: .LBB4_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s4
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s5
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB4_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v1, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB4_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB4_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_noret_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -896,84 +568,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_scalar(ptr addrspace(1) inr
}
define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: s_add_u32 s34, s4, 16
-; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: global_load_dword v1, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
-; GCN1-NEXT: .LBB5_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v2, s34
-; GCN1-NEXT: v_mov_b32_e32 v0, s6
-; GCN1-NEXT: v_mov_b32_e32 v3, s35
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: v_mov_b32_e32 v1, v0
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_execnz .LBB5_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s34, s4, 16
-; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: global_load_dword v1, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
-; GCN2-NEXT: .LBB5_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v2, s34
-; GCN2-NEXT: v_mov_b32_e32 v0, s6
-; GCN2-NEXT: v_mov_b32_e32 v3, s35
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: v_mov_b32_e32 v1, v0
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_execnz .LBB5_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v1, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB5_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v2, s4
-; GCN3-NEXT: v_mov_b32_e32 v0, s6
-; GCN3-NEXT: v_mov_b32_e32 v3, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[2:3], v[0:1] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v1
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: v_mov_b32_e32 v1, v0
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB5_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_noret_offset_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1029,83 +623,6 @@ define amdgpu_gfx void @global_atomic_xchg_f64_noret_offset_scalar(ptr addrspace
}
define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inreg %ptr, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v0, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s5
-; GCN1-NEXT: global_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[34:35], 0
-; GCN1-NEXT: .LBB6_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s4
-; GCN1-NEXT: v_mov_b32_e32 v1, s6
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_mov_b32_e32 v4, s5
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_cbranch_execnz .LBB6_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v0, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s5
-; GCN2-NEXT: global_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[34:35], 0
-; GCN2-NEXT: .LBB6_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s4
-; GCN2-NEXT: v_mov_b32_e32 v1, s6
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_mov_b32_e32 v4, s5
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_cbranch_execnz .LBB6_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v0, v[0:1]
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB6_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s6
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB6_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_ret_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
@@ -1160,87 +677,6 @@ define amdgpu_gfx double @global_atomic_xchg_f64_ret_scalar(ptr addrspace(1) inr
}
define amdgpu_gfx double @global_atomic_xchg_f64_ret_offset_scalar(ptr addrspace(1) inreg %out, double inreg %in) {
-; GCN1-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
-; GCN1: ; %bb.0:
-; GCN1-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN1-NEXT: s_add_u32 s34, s4, 16
-; GCN1-NEXT: s_addc_u32 s35, s5, 0
-; GCN1-NEXT: v_mov_b32_e32 v0, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s35
-; GCN1-NEXT: global_load_dword v0, v[0:1]
-; GCN1-NEXT: s_mov_b64 s[36:37], 0
-; GCN1-NEXT: .LBB7_1: ; %atomicrmw.start
-; GCN1-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN1-NEXT: v_mov_b32_e32 v3, s34
-; GCN1-NEXT: v_mov_b32_e32 v1, s6
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: v_mov_b32_e32 v2, v0
-; GCN1-NEXT: v_mov_b32_e32 v4, s35
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN1-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN1-NEXT: buffer_wbinvl1_vol
-; GCN1-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN1-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN1-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_cbranch_execnz .LBB7_1
-; GCN1-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN1-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN1-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN2-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
-; GCN2: ; %bb.0:
-; GCN2-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN2-NEXT: s_add_u32 s34, s4, 16
-; GCN2-NEXT: s_addc_u32 s35, s5, 0
-; GCN2-NEXT: v_mov_b32_e32 v0, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s35
-; GCN2-NEXT: global_load_dword v0, v[0:1]
-; GCN2-NEXT: s_mov_b64 s[36:37], 0
-; GCN2-NEXT: .LBB7_1: ; %atomicrmw.start
-; GCN2-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN2-NEXT: v_mov_b32_e32 v3, s34
-; GCN2-NEXT: v_mov_b32_e32 v1, s6
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: v_mov_b32_e32 v2, v0
-; GCN2-NEXT: v_mov_b32_e32 v4, s35
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] glc
-; GCN2-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN2-NEXT: buffer_wbinvl1_vol
-; GCN2-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN2-NEXT: s_or_b64 s[36:37], vcc, s[36:37]
-; GCN2-NEXT: s_andn2_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_cbranch_execnz .LBB7_1
-; GCN2-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN2-NEXT: s_or_b64 exec, exec, s[36:37]
-; GCN2-NEXT: s_setpc_b64 s[30:31]
-;
-; GCN3-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
-; GCN3: ; %bb.0:
-; GCN3-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v0, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s5
-; GCN3-NEXT: global_load_dword v0, v[0:1] offset:16
-; GCN3-NEXT: s_mov_b64 s[34:35], 0
-; GCN3-NEXT: .LBB7_1: ; %atomicrmw.start
-; GCN3-NEXT: ; =>This Inner Loop Header: Depth=1
-; GCN3-NEXT: v_mov_b32_e32 v3, s4
-; GCN3-NEXT: v_mov_b32_e32 v1, s6
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: v_mov_b32_e32 v2, v0
-; GCN3-NEXT: v_mov_b32_e32 v4, s5
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: global_atomic_cmpswap v0, v[3:4], v[1:2] offset:16 glc
-; GCN3-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
-; GCN3-NEXT: buffer_wbinvl1_vol
-; GCN3-NEXT: v_cmp_eq_u32_e32 vcc, v0, v2
-; GCN3-NEXT: s_or_b64 s[34:35], vcc, s[34:35]
-; GCN3-NEXT: s_andn2_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_cbranch_execnz .LBB7_1
-; GCN3-NEXT: ; %bb.2: ; %atomicrmw.end
-; GCN3-NEXT: s_or_b64 exec, exec, s[34:35]
-; GCN3-NEXT: s_setpc_b64 s[30:31]
; SI-LABEL: global_atomic_xchg_f64_ret_offset_scalar:
; SI: ; %bb.0:
; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
index fab24e1..86e3d93 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_iterative_scan_fp.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
-; RUN: opt -S -mtriple=amdgcn-- -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer<strategy=iterative>,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
+; RUN: opt -S -mtriple=amdgcn-- -passes='amdgpu-atomic-optimizer<strategy=dpp>,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
declare i32 @llvm.amdgcn.workitem.id.x()
define amdgpu_kernel void @global_atomic_fadd_uni_value(ptr addrspace(1) %ptr) #0 {
; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_value(
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
index f87932b..b9234f4 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_optimizer_fp_no_rtn.ll
@@ -1,55 +1,35 @@
; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=Iterative -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-ITERATIVE %s
-; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -amdgpu-atomic-optimizer-strategy=DPP -passes='amdgpu-atomic-optimizer,verify<domtree>' %s | FileCheck -check-prefix=IR-DPP %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=iterative>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-ITERATIVE %s
+; RUN: opt -S -mtriple=amdgcn-- -mcpu=gfx906 -passes='amdgpu-atomic-optimizer<strategy=dpp>,verify<domtree>' %s | FileCheck --check-prefixes=IR,IR-DPP %s
+
+; Tests various combinations of uniform/divergent address and uniform/divergent value inputs of various types for atomic operations.
+; Optimization remains the same for Iterative and DPP strategies when the value is uniform. These different scan/reduction
+; strategies are valid only for divergent values. This optimization is valid for divergent addresses. Test also covers different scopes.
define amdgpu_ps void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
-; IR-ITERATIVE: 2:
-; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-ITERATIVE: 14:
-; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: br label [[TMP16]]
-; IR-ITERATIVE: 16:
-; IR-ITERATIVE-NEXT: br label [[TMP17]]
-; IR-ITERATIVE: 17:
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
-; IR-DPP: 2:
-; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
-; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
-; IR-DPP-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
-; IR-DPP-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
-; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
-; IR-DPP: 14:
-; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP16]]
-; IR-DPP: 16:
-; IR-DPP-NEXT: br label [[TMP17]]
-; IR-DPP: 17:
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to float
+; IR-NEXT: [[TMP12:%.*]] = fmul float [[VAL:%.*]], [[TMP11]]
+; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR: 14:
+; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT: br label [[TMP16]]
+; IR: 16:
+; IR-NEXT: br label [[TMP17]]
+; IR: 17:
+; IR-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret void
@@ -325,7 +305,6 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_uni_value_agent_scope_stri
ret void
}
-
define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, float %val) #2 {
; IR-ITERATIVE-LABEL: @global_atomic_fsub_uni_address_div_value_agent_scope_strictfp(
; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
@@ -409,45 +388,25 @@ define amdgpu_ps void @global_atomic_fsub_uni_address_div_value_agent_scope_stri
}
define amdgpu_ps void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
-; IR-ITERATIVE: 2:
-; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-ITERATIVE: 10:
-; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: br label [[TMP12]]
-; IR-ITERATIVE: 12:
-; IR-ITERATIVE-NEXT: br label [[TMP13]]
-; IR-ITERATIVE: 13:
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
-; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
-; IR-DPP: 2:
-; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
-; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
-; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
-; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
-; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
-; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
-; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
-; IR-DPP: 10:
-; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: br label [[TMP12]]
-; IR-DPP: 12:
-; IR-DPP-NEXT: br label [[TMP13]]
-; IR-DPP: 13:
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: br label [[TMP13]]
+; IR: 13:
+; IR-NEXT: ret void
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret void
@@ -797,161 +756,531 @@ define amdgpu_ps void @global_atomic_fadd_uni_address_div_value_system_scope_str
ret void
}
-
define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret void
}
define amdgpu_ps void @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic, align 4
ret void
}
define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
ret void
}
define amdgpu_ps void @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("one-as") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("one-as") monotonic
ret void
}
define amdgpu_ps void @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fsub_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fadd ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret void
}
-
define amdgpu_ps void @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fsub_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fsub ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret void
}
define amdgpu_ps void @global_atomic_fmin_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, float inreg %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fmin_div_address_uni_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret void
}
define amdgpu_ps void @global_atomic_fmin_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, float %val) #0 {
-; IR-ITERATIVE-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-ITERATIVE-NEXT: ret void
-;
-; IR-DPP-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
-; IR-DPP-NEXT: ret void
+; IR-LABEL: @global_atomic_fmin_div_address_div_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
;
%result = atomicrmw fmin ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
ret void
}
define amdgpu_ps void @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float inreg %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
+; IR-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
+; IR-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
+; IR-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]])
+; IR-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-NEXT: [[TMP11:%.*]] = uitofp i32 [[TMP10]] to double
+; IR-NEXT: [[TMP12:%.*]] = fmul double [[VAL:%.*]], [[TMP11]]
+; IR-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR: 14:
+; IR-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 4
+; IR-NEXT: br label [[TMP16]]
+; IR: 16:
+; IR-NEXT: br label [[TMP17]]
+; IR: 17:
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_scope_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 14:
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: br label [[TMP17]]
+; IR-ITERATIVE: 17:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_uni_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-DPP: 14:
+; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("one-as") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP16]]
+; IR-DPP: 16:
+; IR-DPP-NEXT: br label [[TMP17]]
+; IR-DPP: 17:
; IR-DPP-NEXT: ret void
;
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
ret void
}
-define amdgpu_ps void @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, float %val) #1{
-; IR-ITERATIVE-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 14:
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: br label [[TMP17]]
+; IR-ITERATIVE: 17:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fmax_div_address_div_value_agent_scope_unsafe_structfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-DPP-LABEL: @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-DPP: 14:
+; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP16]]
+; IR-DPP: 16:
+; IR-DPP-NEXT: br label [[TMP17]]
+; IR-DPP: 17:
; IR-DPP-NEXT: ret void
;
- %result = atomicrmw fmax ptr addrspace(1) %ptr, float %val syncscope("agent") monotonic
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
}
-define amdgpu_ps void @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, float inreg %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+define amdgpu_ps void @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fsub_double_uni_address_div_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live()
+; IR-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR: 2:
+; IR-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true)
+; IR-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0)
+; IR-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]])
+; IR-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR: 10:
+; IR-NEXT: [[TMP11:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: br label [[TMP12]]
+; IR: 12:
+; IR-NEXT: br label [[TMP13]]
+; IR: 13:
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) inreg %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_ps void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double inreg %val) #1{
+; IR-ITERATIVE-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR-ITERATIVE: 10:
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-ITERATIVE-NEXT: br label [[TMP12]]
+; IR-ITERATIVE: 12:
+; IR-ITERATIVE-NEXT: br label [[TMP13]]
+; IR-ITERATIVE: 13:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_uni_value_system_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-LABEL: @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe_structfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP13:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP9]], label [[TMP10:%.*]], label [[TMP12:%.*]]
+; IR-DPP: 10:
+; IR-DPP-NEXT: [[TMP11:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-DPP-NEXT: br label [[TMP12]]
+; IR-DPP: 12:
+; IR-DPP-NEXT: br label [[TMP13]]
+; IR-DPP: 13:
; IR-DPP-NEXT: ret void
;
- %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
ret void
}
-define amdgpu_ps void @global_atomic_fadd_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, float %val) #2 {
-; IR-ITERATIVE-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-ITERATIVE-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+define amdgpu_ps void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) inreg %ptr, double %val) #1{
+; IR-LABEL: @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+; fadd on double with no syncscope: both %ptr and %val are inreg. The
+; IR-ITERATIVE and IR-DPP checks expect the same rewrite (they differ only in the
+; attribute-group capture): guard on llvm.amdgcn.ps.live, take a ballot of the
+; lanes, scale %val by the ctpop of that ballot using constrained uitofp/fmul,
+; and issue one atomicrmw from the lane whose mbcnt result is 0.
+; NOTE(review): align 4 is smaller than the 8-byte double operand - confirm the
+; under-alignment is intentional for this test.
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double inreg %val) #2 {
+; IR-ITERATIVE-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(
+; IR-ITERATIVE-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR7]]
+; IR-ITERATIVE-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR-ITERATIVE: 2:
+; IR-ITERATIVE-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-ITERATIVE-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-ITERATIVE-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-ITERATIVE-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-ITERATIVE-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR7]]
+; IR-ITERATIVE-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-ITERATIVE-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-ITERATIVE: 14:
+; IR-ITERATIVE-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4
+; IR-ITERATIVE-NEXT: br label [[TMP16]]
+; IR-ITERATIVE: 16:
+; IR-ITERATIVE-NEXT: br label [[TMP17]]
+; IR-ITERATIVE: 17:
; IR-ITERATIVE-NEXT: ret void
;
-; IR-DPP-LABEL: @global_atomic_fadd_div_address_div_value_system_scope_strictfp(
-; IR-DPP-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], float [[VAL:%.*]] monotonic, align 4
+; IR-DPP-LABEL: @global_atomic_fadd_double_uni_address_uni_value_system_scope_strictfp(
+; IR-DPP-NEXT: [[TMP1:%.*]] = call i1 @llvm.amdgcn.ps.live() #[[ATTR8]]
+; IR-DPP-NEXT: br i1 [[TMP1]], label [[TMP2:%.*]], label [[TMP17:%.*]]
+; IR-DPP: 2:
+; IR-DPP-NEXT: [[TMP3:%.*]] = call i64 @llvm.amdgcn.ballot.i64(i1 true) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; IR-DPP-NEXT: [[TMP5:%.*]] = lshr i64 [[TMP3]], 32
+; IR-DPP-NEXT: [[TMP6:%.*]] = trunc i64 [[TMP5]] to i32
+; IR-DPP-NEXT: [[TMP7:%.*]] = call i32 @llvm.amdgcn.mbcnt.lo(i32 [[TMP4]], i32 0) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP8:%.*]] = call i32 @llvm.amdgcn.mbcnt.hi(i32 [[TMP6]], i32 [[TMP7]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP9:%.*]] = call i64 @llvm.ctpop.i64(i64 [[TMP3]]) #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP10:%.*]] = trunc i64 [[TMP9]] to i32
+; IR-DPP-NEXT: [[TMP11:%.*]] = call double @llvm.experimental.constrained.uitofp.f64.i32(i32 [[TMP10]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP12:%.*]] = call double @llvm.experimental.constrained.fmul.f64(double [[VAL:%.*]], double [[TMP11]], metadata !"round.dynamic", metadata !"fpexcept.strict") #[[ATTR8]]
+; IR-DPP-NEXT: [[TMP13:%.*]] = icmp eq i32 [[TMP8]], 0
+; IR-DPP-NEXT: br i1 [[TMP13]], label [[TMP14:%.*]], label [[TMP16:%.*]]
+; IR-DPP: 14:
+; IR-DPP-NEXT: [[TMP15:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[TMP12]] monotonic, align 4
+; IR-DPP-NEXT: br label [[TMP16]]
+; IR-DPP: 16:
+; IR-DPP-NEXT: br label [[TMP17]]
+; IR-DPP: 17:
; IR-DPP-NEXT: ret void
;
- %result = atomicrmw fadd ptr addrspace(1) %ptr, float %val monotonic, align 4
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
+ ret void
+}
+
+; fadd on double with no syncscope: %ptr is inreg, %val is not. The shared IR
+; checks expect the atomicrmw to come out unchanged.
+; NOTE(review): align 4 is smaller than the 8-byte double operand - confirm the
+; under-alignment is intentional for this test.
+define amdgpu_ps void @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(ptr addrspace(1) inreg %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fadd_double_uni_address_div_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
+ ret void
+}
+
+; fadd on double at syncscope("agent"): %ptr is not inreg, %val is inreg. The
+; shared IR checks expect the atomicrmw to come out unchanged.
+; NOTE(review): align 4 is smaller than the 8-byte double operand - confirm the
+; under-alignment is intentional for this test.
+define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret void
+}
+
+; fadd on double at syncscope("agent"): neither %ptr nor %val is inreg. The
+; shared IR checks expect the atomicrmw to come out unchanged.
+; NOTE(review): align 4 is smaller than the 8-byte double operand - confirm the
+; under-alignment is intentional for this test.
+define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_agent_scope_unsafe(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic, align 4
+ ret void
+}
+
+; fadd on double at syncscope("one-as"): %ptr is not inreg, %val is inreg. The
+; shared IR checks expect the atomicrmw unchanged, with the omitted alignment
+; printed as align 8.
+; NOTE(review): "structfp" in the name looks like a typo for "strictfp" - confirm.
+define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret void
+}
+
+; fadd on double at syncscope("one-as"): neither %ptr nor %val is inreg. The
+; shared IR checks expect the atomicrmw unchanged, with the omitted alignment
+; printed as align 8.
+; NOTE(review): "structfp" in the name looks like a typo for "strictfp" - confirm.
+define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_one_as_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("one-as") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val syncscope("one-as") monotonic
+ ret void
+}
+
+; fsub on double at syncscope("agent"): %ptr is not inreg, %val is inreg. The
+; operation is fsub so that the test exercises what its name says, matching the
+; div_value fsub test that follows it. The shared IR checks expect the atomicrmw
+; unchanged, with the omitted alignment printed as align 8.
+define amdgpu_ps void @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 {
+; IR-LABEL: @global_atomic_fsub_double_div_address_uni_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+; fsub on double at syncscope("agent"): neither %ptr nor %val is inreg. The
+; shared IR checks expect the atomicrmw unchanged, with the omitted alignment
+; printed as align 8.
+define amdgpu_ps void @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fsub_double_div_address_div_value_agent_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fsub ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+; fmin on double at syncscope("agent"): %ptr is not inreg, %val is inreg. The
+; shared IR checks expect the atomicrmw unchanged, with the omitted alignment
+; printed as align 8.
+define amdgpu_ps void @global_atomic_fmin_double_div_address_uni_value_agent_scope(ptr addrspace(1) %ptr, double inreg %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_div_address_uni_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+; fmin on double at syncscope("agent"): neither %ptr nor %val is inreg. The
+; shared IR checks expect the atomicrmw unchanged, with the omitted alignment
+; printed as align 8.
+define amdgpu_ps void @global_atomic_fmin_double_div_address_div_value_agent_scope(ptr addrspace(1) %ptr, double %val) #0 {
+; IR-LABEL: @global_atomic_fmin_double_div_address_div_value_agent_scope(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmin ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+; fmax on double at syncscope("agent"): %ptr is not inreg, %val is inreg. The
+; shared IR checks expect the atomicrmw unchanged, with the omitted alignment
+; printed as align 8.
+; NOTE(review): "structfp" in the name looks like a typo for "strictfp" - confirm.
+define amdgpu_ps void @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double inreg %val) #1{
+; IR-LABEL: @global_atomic_fmax_double_div_address_uni_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+; fmax on double at syncscope("agent"): neither %ptr nor %val is inreg. The
+; shared IR checks expect the atomicrmw unchanged, with the omitted alignment
+; printed as align 8.
+; NOTE(review): "structfp" in the name looks like a typo for "strictfp" - confirm.
+define amdgpu_ps void @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr, double %val) #1{
+; IR-LABEL: @global_atomic_fmax_double_div_address_div_value_agent_scope_unsafe_structfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fmax ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] syncscope("agent") monotonic, align 8
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %val syncscope("agent") monotonic
+ ret void
+}
+
+; fadd on double with no syncscope: %ptr is not inreg, %val is inreg. The shared
+; IR checks expect the atomicrmw to come out unchanged.
+; NOTE(review): align 4 is smaller than the 8-byte double operand - confirm the
+; under-alignment is intentional for this test.
+define amdgpu_ps void @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(ptr addrspace(1) %ptr, double inreg %val) #2 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_uni_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
+ ret void
+}
+
+; fadd on double with no syncscope: neither %ptr nor %val is inreg. The shared
+; IR checks expect the atomicrmw to come out unchanged.
+; NOTE(review): align 4 is smaller than the 8-byte double operand - confirm the
+; under-alignment is intentional for this test.
+define amdgpu_ps void @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(ptr addrspace(1) %ptr, double %val) #2 {
+; IR-LABEL: @global_atomic_fadd_double_div_address_div_value_system_scope_strictfp(
+; IR-NEXT: [[RESULT:%.*]] = atomicrmw fadd ptr addrspace(1) [[PTR:%.*]], double [[VAL:%.*]] monotonic, align 4
+; IR-NEXT: ret void
+;
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %val monotonic, align 4
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
index 96c615b..4f00d48 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fadd.ll
@@ -13,6 +13,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
+declare double @div.double.value()
define amdgpu_kernel void @global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fadd_uni_address_uni_value_agent_scope_unsafe:
@@ -5408,6 +5409,5583 @@ define amdgpu_kernel void @global_atomic_fadd_uni_address_div_value_defalut_scop
ret void
}
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2
+; GFX7LESS-NEXT: .LBB9_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-NEXT: .LBB9_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-NEXT: .LBB9_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-NEXT: .LBB9_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-NEXT: .LBB9_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-NEXT: .LBB9_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-DPP-NEXT: .LBB9_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-DPP-NEXT: .LBB9_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.float.value()
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2
+; GFX7LESS-NEXT: .LBB11_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-NEXT: .LBB11_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-NEXT: .LBB11_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-NEXT: .LBB11_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-NEXT: .LBB11_3:
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-NEXT: .LBB11_3:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-DPP-NEXT: .LBB11_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-DPP-NEXT: .LBB11_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-DPP-NEXT: .LBB11_3:
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-DPP-NEXT: .LBB11_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB12_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value() strictfp
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
+; GFX7LESS-NEXT: .LBB13_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB13_2
+; GFX9-NEXT: .LBB13_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1064-NEXT: .LBB13_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1032-NEXT: .LBB13_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1164-NEXT: .LBB13_3:
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1132-NEXT: .LBB13_3:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX9-DPP-NEXT: .LBB13_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1064-DPP-NEXT: .LBB13_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1032-DPP-NEXT: .LBB13_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1164-DPP-NEXT: .LBB13_3:
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1132-DPP-NEXT: .LBB13_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB14_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
+ ret void
+}
+
+; Kernel test for an atomicrmw fadd of a double to the global (addrspace(1))
+; pointer %ptr, using syncscope("agent") and monotonic ordering. The addend is
+; the result of a call marked strictfp, and the atomicrmw result is unused.
+; The assertion lines below expect, for each listed check prefix, an indirect
+; call whose target is loaded via a gotpcrel32 relocation (s_getpc_b64 ...
+; s_swappc_b64), followed by a retry loop (%atomicrmw.start to %atomicrmw.end)
+; made of v_add_f64 and a 64-bit compare-and-swap.
+; NOTE(review): the "_structfp" suffix looks like a typo for "_strictfp" (the
+; call in the body carries the strictfp marker); renaming it would also require
+; regenerating every -LABEL line below.
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB15_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+; NOTE(review): the call below is typed double but targets @div.float.value,
+; whereas the preceding test in this file calls @div.double.value. The
+; assertion lines above reference the div.float.value symbol accordingly -
+; confirm the callee choice is intended before changing either side.
+ %divValue = call double @div.float.value() strictfp
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
+ ret void
+}
+
+; Test case: atomicrmw fadd of the constant double 4.0 to a uniform global
+; pointer, monotonic ordering, no syncscope operand, align 4.
+; What the assertions below pin down, for every check prefix:
+;   - only the lane whose v_mbcnt result is 0 stays active for the update;
+;   - s_bcnt1 of the exec mask feeds a v_add_f64 / v_mul_f64-by-4.0 sequence,
+;     and that product is the operand added inside the loop;
+;   - the update is a loop (%atomicrmw.start) around a call to
+;     __atomic_compare_exchange, repeated until bit 0 of the returned v0 is 1.
+; The "defalut" spelling in the name is repeated in every -LABEL line; keep
+; them in sync if the function is ever renamed.
+; NOTE(review): attribute group #2 is defined outside this view; presumably it
+; carries strictfp, as the name suggests -- confirm.
+; NOTE(review): the assertions look machine-generated (update_llc_test_checks.py
+; style) -- presumably they should be regenerated rather than hand-edited; confirm.
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[41:42]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2
+; GFX7LESS-NEXT: .LBB16_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB16_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: s_mov_b32 s1, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB16_2
+; GFX9-NEXT: .LBB16_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB16_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1064-NEXT: .LBB16_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB16_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1032-NEXT: .LBB16_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB16_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1164-NEXT: .LBB16_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB16_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1132-NEXT: .LBB16_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX9-DPP-NEXT: .LBB16_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1064-DPP-NEXT: .LBB16_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1032-DPP-NEXT: .LBB16_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1164-DPP-NEXT: .LBB16_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[41:42]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1132-DPP-NEXT: .LBB16_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ ; NOTE(review): align 4 is smaller than the 8-byte size of double; presumably
+ ; this is what forces the __atomic_compare_exchange path checked above -- confirm.
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], v[40:41]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB17_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fadd_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], v[40:41]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.float.value() strictfp
+ %result = atomicrmw fadd ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ ret void
+}
+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #2 = { strictfp}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
index 3cc5a4c..622be43 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmax.ll
@@ -13,6 +13,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
+declare double @div.double.value() ; double-returning counterpart of @div.float.value; NOTE(review): presumably the divergent-value source for the *_double_* tests - confirm call sites use `call double`
define amdgpu_kernel void @global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmax_uni_address_uni_value_agent_scope_unsafe:
@@ -3550,6 +3551,3965 @@ define amdgpu_kernel void @global_atomic_fmax_uni_address_div_value_defalut_scop
ret void
}
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2
+; GFX7LESS-NEXT: .LBB6_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB6_2
+; GFX9-NEXT: .LBB6_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1064-NEXT: .LBB6_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1032-NEXT: .LBB6_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1164-NEXT: .LBB6_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1132-NEXT: .LBB6_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT: .LBB6_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1064-DPP-NEXT: .LBB6_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1032-DPP-NEXT: .LBB6_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1164-DPP-NEXT: .LBB6_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1132-DPP-NEXT: .LBB6_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2
+; GFX7LESS-NEXT: .LBB8_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB8_2
+; GFX9-NEXT: .LBB8_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1064-NEXT: .LBB8_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1032-NEXT: .LBB8_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1164-NEXT: .LBB8_3:
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1132-NEXT: .LBB8_3:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX9-DPP-NEXT: .LBB8_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1064-DPP-NEXT: .LBB8_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1032-DPP-NEXT: .LBB8_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1164-DPP-NEXT: .LBB8_3:
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1132-DPP-NEXT: .LBB8_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], 4.0
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2
+; GFX7LESS-NEXT: .LBB10_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-NEXT: .LBB10_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-NEXT: .LBB10_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-NEXT: .LBB10_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-NEXT: .LBB10_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-NEXT: .LBB10_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-DPP-NEXT: .LBB10_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-DPP-NEXT: .LBB10_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-DPP-NEXT: .LBB10_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-DPP-NEXT: .LBB10_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT: .LBB10_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmax_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmax ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ ret void
+}
+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
index 314c52a..49d415c 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fmin.ll
@@ -13,6 +13,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
+declare double @div.double.value() ; callers consume the result as a 64-bit double
define amdgpu_kernel void @global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fmin_uni_address_uni_value_agent_scope_unsafe:
@@ -3550,6 +3551,3965 @@ define amdgpu_kernel void @global_atomic_fmin_uni_address_div_value_defalut_scop
ret void
}
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB6_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB6_2
+; GFX7LESS-NEXT: .LBB6_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB6_2
+; GFX9-NEXT: .LBB6_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1064-NEXT: .LBB6_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1032-NEXT: .LBB6_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1164-NEXT: .LBB6_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1132-NEXT: .LBB6_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX9-DPP-NEXT: .LBB6_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1064-DPP-NEXT: .LBB6_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1032-DPP-NEXT: .LBB6_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1164-DPP-NEXT: .LBB6_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB6_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB6_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB6_2
+; GFX1132-DPP-NEXT: .LBB6_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB7_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB7_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB7_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ ret void
+}
+
+; Kernel under test: an atomicrmw fmin of the constant double 4.0 into the
+; global (addrspace(1)) pointer received as the kernel argument, using the
+; "one-as" sync scope and monotonic ordering.
+; All expected sequences below share one shape: the update is guarded so it
+; only runs in lanes whose v_mbcnt result is 0, and inside the guard a loop
+; takes v_max_f64 of the current value with itself, v_min_f64 of that with
+; 4.0, and issues a 64-bit compare-and-swap, repeating until the value read
+; back equals the value that was expected.
+; NOTE(review): the expectation lines look machine-generated; presumably they
+; should be regenerated by the update script rather than edited by hand.
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB8_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s7
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[4:7], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v5
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB8_2
+; GFX7LESS-NEXT: .LBB8_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB8_2
+; GFX9-NEXT: .LBB8_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1064-NEXT: .LBB8_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1032-NEXT: .LBB8_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1164-NEXT: .LBB8_3:
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1132-NEXT: .LBB8_3:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX9-DPP-NEXT: .LBB8_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1064-DPP-NEXT: .LBB8_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s3, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1032-DPP-NEXT: .LBB8_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1164-DPP-NEXT: .LBB8_3:
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_one_as_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB8_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: .LBB8_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], 4.0
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v4, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB8_2
+; GFX1132-DPP-NEXT: .LBB8_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ ; Operation under test; %result has no later use in this function.
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX7LESS-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX9-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1064-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v40, s[34:35]
+; GFX1032-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1164-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_one_as_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v40, s[34:35]
+; GFX1132-DPP-NEXT: v_max_f64 v[4:5], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB9_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v40, v[0:3], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, exec_lo, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, exec_hi, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB10_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s1
+; GFX7LESS-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[2:3], v[0:1], v[0:1]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[2:3], 4.0
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_2
+; GFX7LESS-NEXT: .LBB10_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-NEXT: .LBB10_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-NEXT: .LBB10_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-NEXT: .LBB10_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-NEXT: .LBB10_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-NEXT: .LBB10_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX9-DPP-NEXT: .LBB10_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1064-DPP-NEXT: .LBB10_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1032-DPP-NEXT: .LBB10_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, exec_hi, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1164-DPP-NEXT: .LBB10_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_uni_value_defalut_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB10_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB10_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_max_f64 v[3:4], v[1:2], v[1:2]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_4)
+; GFX1132-DPP-NEXT: v_min_f64 v[3:4], v[3:4], 4.0
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_2
+; GFX1132-DPP-NEXT: .LBB10_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v40, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[2:3], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX7LESS-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX9-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1064-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v40, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[2:3], v41, s[42:43]
+; GFX1032-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v0, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v3, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1164-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fmin_double_uni_address_div_value_defalut_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: global_load_b64 v[2:3], v41, s[42:43]
+; GFX1132-DPP-NEXT: v_max_f64 v[41:42], v[0:1], v[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB11_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_max_f64 v[0:1], v[2:3], v[2:3]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1132-DPP-NEXT: v_min_f64 v[0:1], v[0:1], v[41:42]
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[2:3], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[0:1], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s42 :: v_dual_mov_b32 v3, s43
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[2:3], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fmin ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ ret void
+}
+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
!llvm.module.flags = !{!0}
diff --git a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
index bc9125e..7a7ddbe 100644
--- a/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
+++ b/llvm/test/CodeGen/AMDGPU/global_atomics_scan_fsub.ll
@@ -13,6 +13,7 @@
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -amdgpu-atomic-optimizer-strategy=DPP -verify-machineinstrs < %s | FileCheck -enable-var-scope -check-prefixes=GFX1132-DPP %s
declare float @div.float.value()
+declare double @div.double.value()
define amdgpu_kernel void @global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
; GFX7LESS-LABEL: global_atomic_fsub_uni_address_uni_value_agent_scope_unsafe:
@@ -5616,6 +5617,5581 @@ define amdgpu_kernel void @global_atomic_fsub_uni_address_div_value_defalut_scop
ret void
}
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB9_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX7LESS-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX7LESS-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB9_2
+; GFX7LESS-NEXT: .LBB9_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-NEXT: .LBB9_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-NEXT: .LBB9_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-NEXT: .LBB9_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-NEXT: .LBB9_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-NEXT: .LBB9_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX9-DPP-NEXT: .LBB9_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1064-DPP-NEXT: .LBB9_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1032-DPP-NEXT: .LBB9_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1164-DPP-NEXT: .LBB9_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v0
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB9_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: v_cvt_f64_u32_e32 v[0:1], s0
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], v[0:1], 4.0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB9_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB9_2
+; GFX1132-DPP-NEXT: .LBB9_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB10_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB10_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_align4_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB10_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB10_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.float.value()
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic, align 4
+ ret void
+}
+
+; Test case (uniform address, uniform value): `atomicrmw fsub` of the constant
+; double 4.0 on a global (addrspace(1)) kernel-argument pointer, with
+; syncscope("one-as") and monotonic ordering; the returned value is unused.
+; In every check block below the operation is expected to lower to a
+; compare-and-swap loop (%atomicrmw.start) built on a 64-bit cmpswap rather
+; than a single fsub instruction. Ahead of the loop each block counts the
+; exec bits (s_bcnt1), multiplies the resulting double by 4.0 (v_mul_f64),
+; restricts exec to lanes whose v_mbcnt result is 0 and branches past the
+; loop (s_cbranch_execz) when none remain.
+; NOTE(review): the 0x43300000 / 0xc3300000 pair looks like the usual
+; integer-to-double conversion of the lane count - confirm.
+; NOTE(review): the check lines look script-generated; regenerate them
+; instead of editing by hand - confirm against the file header.
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB11_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB11_2
+; GFX7LESS-NEXT: .LBB11_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-NEXT: .LBB11_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-NEXT: .LBB11_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-NEXT: .LBB11_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-NEXT: .LBB11_3:
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-NEXT: .LBB11_3:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX9-DPP-NEXT: .LBB11_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1064-DPP-NEXT: .LBB11_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1032-DPP-NEXT: .LBB11_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1164-DPP-NEXT: .LBB11_3:
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB11_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: .LBB11_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB11_2
+; GFX1132-DPP-NEXT: .LBB11_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("one-as") monotonic
+ ret void
+}
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB12_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB12_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_one_as_scope_unsafe_structfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB12_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB12_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value() strictfp
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("one-as") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp(ptr addrspace(1) %ptr) #2{
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s14, -1
+; GFX7LESS-NEXT: s_mov_b32 s15, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s12, s12, s3
+; GFX7LESS-NEXT: s_addc_u32 s13, s13, 0
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s2, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s3, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB13_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s6, s[2:3]
+; GFX7LESS-NEXT: s_mov_b32 s7, 0x43300000
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[8:9], s[0:1], 0x0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], 0
+; GFX7LESS-NEXT: s_mov_b32 s3, 0xf000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[6:7], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s8
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s9
+; GFX7LESS-NEXT: s_mov_b32 s2, -1
+; GFX7LESS-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v2
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v1
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v0
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[0:3], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[2:3]
+; GFX7LESS-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB13_2
+; GFX7LESS-NEXT: .LBB13_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s10, -1
+; GFX9-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-NEXT: s_add_u32 s8, s8, s3
+; GFX9-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-NEXT: s_cbranch_execnz .LBB13_2
+; GFX9-NEXT: .LBB13_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s10, -1
+; GFX1064-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1064-NEXT: .LBB13_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s10, -1
+; GFX1032-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-NEXT: s_mov_b32 s2, 0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1032-NEXT: .LBB13_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1164-NEXT: .LBB13_3:
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_mov_b32 s2, 0
+; GFX1132-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1132-NEXT: .LBB13_3:
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s10, -1
+; GFX9-DPP-NEXT: s_mov_b32 s11, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX9-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[2:3], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX9-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX9-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX9-DPP-NEXT: .LBB13_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s11, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s2, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s3, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1064-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[2:3]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1064-DPP-NEXT: .LBB13_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s8, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s9, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s10, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s11, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s8, s8, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s9, 0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s4, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s4, s3
+; GFX1032-DPP-NEXT: s_mov_b32 s5, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s4
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s5
+; GFX1032-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1032-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s2
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1032-DPP-NEXT: .LBB13_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s2, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s2
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1164-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s2
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s3
+; GFX1164-DPP-NEXT: s_mov_b64 s[2:3], 0
+; GFX1164-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, v1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[2:3], vcc, s[2:3]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[2:3]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1164-DPP-NEXT: .LBB13_3:
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_agent_scope_strictfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s2, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v0, 0x43300000 :: v_dual_mov_b32 v1, s2
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s2, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s3, exec_lo
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:4
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB13_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[4:5], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GFX1132-DPP-NEXT: v_mul_f64 v[4:5], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5
+; GFX1132-DPP-NEXT: .LBB13_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], v[2:3], -v[4:5]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[0:1], v6, v[0:3], s[0:1] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[0:1], v[2:3]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_mov_b32 v2, v0
+; GFX1132-DPP-NEXT: s_or_b32 s2, vcc_lo, s2
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s2
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB13_2
+; GFX1132-DPP-NEXT: .LBB13_3:
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe(ptr addrspace(1) %ptr) #0 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB14_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB14_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.double.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.double.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.double.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.double.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB14_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB14_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.double.value()
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp(ptr addrspace(1) %ptr) #1 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_mov_b32 s32, 0
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s9
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s14, s8
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s39, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s38, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s2, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s3, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[2:3]
+; GFX7LESS-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v31, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s12, s6
+; GFX7LESS-NEXT: s_mov_b32 s13, s7
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], 0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], 0
+; GFX7LESS-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v9, v5
+; GFX7LESS-NEXT: v_mov_b32_e32 v8, v4
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, v3
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, v2
+; GFX7LESS-NEXT: buffer_atomic_cmpswap_x2 v[6:9], off, s[36:39], 0 glc
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_cmp_eq_u64_e32 vcc, v[6:7], v[4:5]
+; GFX7LESS-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, v6
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, v7
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB15_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s38, -1
+; GFX9-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-NEXT: s_add_u32 s36, s36, s9
+; GFX9-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-NEXT: s_mov_b32 s14, s8
+; GFX9-NEXT: s_add_u32 s8, s2, 44
+; GFX9-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-NEXT: s_getpc_b64 s[2:3]
+; GFX9-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-NEXT: s_mov_b32 s12, s6
+; GFX9-NEXT: s_mov_b32 s13, s7
+; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-NEXT: s_mov_b32 s32, 0
+; GFX9-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB15_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s38, -1
+; GFX1064-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-NEXT: s_mov_b32 s14, s8
+; GFX1064-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s6
+; GFX1064-NEXT: s_mov_b32 s13, s7
+; GFX1064-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-NEXT: s_mov_b32 s32, 0
+; GFX1064-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s38, -1
+; GFX1032-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-NEXT: s_mov_b32 s14, s8
+; GFX1032-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s6
+; GFX1032-NEXT: s_mov_b32 s13, s7
+; GFX1032-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-NEXT: s_mov_b32 s32, 0
+; GFX1032-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-NEXT: s_mov_b32 s0, 0
+; GFX1032-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b32 s14, s8
+; GFX1164-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s32, 0
+; GFX1164-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-NEXT: s_mov_b32 s0, 0
+; GFX1132-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s38, -1
+; GFX9-DPP-NEXT: s_mov_b32 s39, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX9-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX9-DPP-NEXT: s_mov_b32 s14, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX9-DPP-NEXT: s_getpc_b64 s[2:3]
+; GFX9-DPP-NEXT: s_add_u32 s2, s2, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s3, s3, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[16:17], s[2:3], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX9-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b32 s12, s6
+; GFX9-DPP-NEXT: s_mov_b32 s13, s7
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b32 s32, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX9-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX9-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX9-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s39, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1064-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1064-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1064-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1064-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1064-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1064-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s36, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s38, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s39, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s36, s36, s9
+; GFX1032-DPP-NEXT: s_addc_u32 s37, s37, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1032-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[16:17], s[4:5], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[34:35], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1032-DPP-NEXT: v_or3_b32 v31, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[4:5], v40, s[34:35]
+; GFX1032-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1032-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1032-DPP-NEXT: global_atomic_cmpswap_x2 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1032-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s0
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1164-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[16:17], s[4:5], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[16:17]
+; GFX1164-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], 0
+; GFX1164-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1164-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[4:5]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, v3
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, v2
+; GFX1164-DPP-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX1164-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[0:1]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_agent_scope_unsafe_structfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_add_u32 s8, s2, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s3, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[4:5]
+; GFX1132-DPP-NEXT: s_getpc_b64 s[4:5]
+; GFX1132-DPP-NEXT: s_add_u32 s4, s4, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s5, s5, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: s_load_b64 s[6:7], s[4:5], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[34:35], s[2:3], 0x24
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, 0 :: v_dual_mov_b32 v31, v0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[0:1]
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1132-DPP-NEXT: global_load_b64 v[4:5], v40, s[34:35]
+; GFX1132-DPP-NEXT: s_mov_b32 s0, 0
+; GFX1132-DPP-NEXT: .LBB15_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[2:3], v[4:5], -v[0:1]
+; GFX1132-DPP-NEXT: global_atomic_cmpswap_b64 v[2:3], v40, v[2:5], s[34:35] glc
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[4:5]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v4, v2
+; GFX1132-DPP-NEXT: s_or_b32 s0, vcc_lo, s0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s0
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB15_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.float.value() strictfp
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue syncscope("agent") monotonic
+ ret void
+}
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s42, -1
+; GFX7LESS-NEXT: s_mov_b32 s43, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s40, s40, s3
+; GFX7LESS-NEXT: s_addc_u32 s41, s41, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s2
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], exec
+; GFX7LESS-NEXT: v_mbcnt_lo_u32_b32_e64 v0, s0, 0
+; GFX7LESS-NEXT: v_mbcnt_hi_u32_b32_e32 v0, s1, v0
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX7LESS-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX7LESS-NEXT: s_cbranch_execz .LBB16_3
+; GFX7LESS-NEXT: ; %bb.1:
+; GFX7LESS-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x9
+; GFX7LESS-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX7LESS-NEXT: s_mov_b32 s1, 0x43300000
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_load_dwordx2 s[2:3], s[36:37], 0x0
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX7LESS-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
+; GFX7LESS-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], 0
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, s2
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, s3
+; GFX7LESS-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[41:42]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s34, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s35, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b32 s12, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v40
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s36
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s37
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[40:43], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[40:43], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB16_2
+; GFX7LESS-NEXT: .LBB16_3:
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s42, -1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-NEXT: s_add_u32 s40, s40, s3
+; GFX9-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-NEXT: s_mov_b32 s33, s2
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-NEXT: s_cbranch_execz .LBB16_3
+; GFX9-NEXT: ; %bb.1:
+; GFX9-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-NEXT: s_mov_b32 s1, 0x43300000
+; GFX9-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
+; GFX9-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-NEXT: s_add_u32 s8, s34, 44
+; GFX9-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-NEXT: s_mov_b32 s12, s33
+; GFX9-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-NEXT: s_cbranch_execnz .LBB16_2
+; GFX9-NEXT: .LBB16_3:
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s42, -1
+; GFX1064-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-NEXT: s_mov_b32 s33, s2
+; GFX1064-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-NEXT: s_cbranch_execz .LBB16_3
+; GFX1064-NEXT: ; %bb.1:
+; GFX1064-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1064-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1064-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-NEXT: s_mov_b32 s12, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1064-NEXT: .LBB16_3:
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s33, s2
+; GFX1032-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s42, -1
+; GFX1032-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-NEXT: s_mov_b32 s38, 0
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-NEXT: s_cbranch_execz .LBB16_3
+; GFX1032-NEXT: ; %bb.1:
+; GFX1032-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1032-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-NEXT: s_mov_b32 s12, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1032-NEXT: .LBB16_3:
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1164-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1164-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-NEXT: s_cbranch_execz .LBB16_3
+; GFX1164-NEXT: ; %bb.1:
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-NEXT: s_mov_b32 s33, s2
+; GFX1164-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b32 s12, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1164-NEXT: .LBB16_3:
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1132-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1132-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1132-NEXT: s_mov_b32 s38, 0
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-NEXT: s_cbranch_execz .LBB16_3
+; GFX1132-NEXT: ; %bb.1:
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b32 s12, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1132-NEXT: .LBB16_3:
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s42, -1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX9-DPP-NEXT: s_mov_b32 s43, 0xe00000
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s0, 0
+; GFX9-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX9-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s1, v0
+; GFX9-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX9-DPP-NEXT: s_mov_b32 s33, s2
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: s_and_saveexec_b64 s[2:3], vcc
+; GFX9-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX9-DPP-NEXT: ; %bb.1:
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 0
+; GFX9-DPP-NEXT: s_bcnt1_i32_b64 s0, s[0:1]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0xc3300000
+; GFX9-DPP-NEXT: s_mov_b32 s1, 0x43300000
+; GFX9-DPP-NEXT: v_add_f64 v[0:1], s[0:1], v[0:1]
+; GFX9-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX9-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX9-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX9-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: s_mov_b32 s12, s33
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX9-DPP-NEXT: .LBB16_3:
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s43, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], exec
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1064-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mbcnt_hi_u32_b32 v0, s3, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
+; GFX1064-DPP-NEXT: s_and_saveexec_b64 s[0:1], vcc
+; GFX1064-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1064-DPP-NEXT: ; %bb.1:
+; GFX1064-DPP-NEXT: s_bcnt1_i32_b64 s0, s[2:3]
+; GFX1064-DPP-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1064-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1064-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1064-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[38:39]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1064-DPP-NEXT: .LBB16_3:
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s2, exec_lo
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: v_mbcnt_lo_u32_b32 v0, s2, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s40, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s41, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s42, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s43, 0x31c16000
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
+; GFX1032-DPP-NEXT: s_add_u32 s40, s40, s3
+; GFX1032-DPP-NEXT: s_addc_u32 s41, s41, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1032-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: s_and_saveexec_b32 s0, vcc_lo
+; GFX1032-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1032-DPP-NEXT: ; %bb.1:
+; GFX1032-DPP-NEXT: s_bcnt1_i32_b32 s0, s2
+; GFX1032-DPP-NEXT: s_mov_b32 s1, 0x43300000
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[36:37], s[34:35], 0x24
+; GFX1032-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, s[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[0:1], s[36:37], 0x0
+; GFX1032-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1032-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[40:41]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[42:43]
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[40:43], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[40:43], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[4:5]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[40:43], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[40:43], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s38
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1032-DPP-NEXT: .LBB16_3:
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1164-DPP-NEXT: s_bcnt1_i32_b64 s0, exec
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1164-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1164-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1164-DPP-NEXT: v_mbcnt_hi_u32_b32 v2, exec_hi, v2
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: s_mov_b64 s[0:1], exec
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1164-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1164-DPP-NEXT: ; %bb.1:
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1164-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s2
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s1
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, s0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v40
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s36
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s37
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[38:39], vcc, s[38:39]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[38:39]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1164-DPP-NEXT: .LBB16_3:
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_uni_value_defalut_scope_strictfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[0:1]
+; GFX1132-DPP-NEXT: s_bcnt1_i32_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v0, 0x43300000
+; GFX1132-DPP-NEXT: v_mbcnt_lo_u32_b32 v2, exec_lo, 0
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v0, off offset:20
+; GFX1132-DPP-NEXT: scratch_store_b32 off, v1, off offset:16
+; GFX1132-DPP-NEXT: scratch_load_b64 v[0:1], off, off offset:16
+; GFX1132-DPP-NEXT: s_mov_b32 s38, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s0, exec_lo
+; GFX1132-DPP-NEXT: v_cmpx_eq_u32_e32 0, v2
+; GFX1132-DPP-NEXT: s_cbranch_execz .LBB16_3
+; GFX1132-DPP-NEXT: ; %bb.1:
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[0:1], 0xc3300000, v[0:1]
+; GFX1132-DPP-NEXT: s_load_b64 s[36:37], s[34:35], 0x24
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[36:37], 0x0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_mul_f64 v[41:42], 4.0, v[0:1]
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v2, s1 :: v_dual_mov_b32 v1, s0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB16_2: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[41:42]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v40 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s36
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s37 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s38, vcc_lo, s38
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s38
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB16_2
+; GFX1132-DPP-NEXT: .LBB16_3:
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double 4.0 monotonic, align 4
+ ret void
+}
+
+define amdgpu_kernel void @global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp(ptr addrspace(1) %ptr) #2 {
+; GFX7LESS-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX7LESS: ; %bb.0:
+; GFX7LESS-NEXT: s_movk_i32 s32, 0x800
+; GFX7LESS-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX7LESS-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX7LESS-NEXT: s_mov_b32 s50, -1
+; GFX7LESS-NEXT: s_mov_b32 s51, 0xe8f000
+; GFX7LESS-NEXT: s_add_u32 s48, s48, s9
+; GFX7LESS-NEXT: s_addc_u32 s49, s49, 0
+; GFX7LESS-NEXT: s_mov_b32 s33, s8
+; GFX7LESS-NEXT: s_mov_b32 s40, s7
+; GFX7LESS-NEXT: s_mov_b32 s41, s6
+; GFX7LESS-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX7LESS-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX7LESS-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX7LESS-NEXT: s_load_dwordx2 s[44:45], s[2:3], 0x9
+; GFX7LESS-NEXT: s_mov_b32 s47, 0xf000
+; GFX7LESS-NEXT: s_mov_b32 s46, -1
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX7LESS-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v0, v0, v1
+; GFX7LESS-NEXT: v_or_b32_e32 v42, v0, v2
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_mov_b32_e32 v40, v0
+; GFX7LESS-NEXT: v_mov_b32_e32 v41, v1
+; GFX7LESS-NEXT: buffer_load_dwordx2 v[0:1], off, s[44:47], 0
+; GFX7LESS-NEXT: s_mov_b64 s[42:43], 0
+; GFX7LESS-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX7LESS-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7LESS-NEXT: s_waitcnt vmcnt(0)
+; GFX7LESS-NEXT: v_add_f64 v[2:3], v[0:1], -v[40:41]
+; GFX7LESS-NEXT: buffer_store_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: buffer_store_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: s_add_u32 s8, s36, 44
+; GFX7LESS-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:12
+; GFX7LESS-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:8
+; GFX7LESS-NEXT: s_addc_u32 s9, s37, 0
+; GFX7LESS-NEXT: s_getpc_b64 s[0:1]
+; GFX7LESS-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX7LESS-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX7LESS-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX7LESS-NEXT: s_waitcnt expcnt(2)
+; GFX7LESS-NEXT: v_mov_b32_e32 v0, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v1, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v4, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v5, 8
+; GFX7LESS-NEXT: v_mov_b32_e32 v6, 0
+; GFX7LESS-NEXT: v_mov_b32_e32 v7, 0
+; GFX7LESS-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX7LESS-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX7LESS-NEXT: s_mov_b32 s12, s41
+; GFX7LESS-NEXT: s_mov_b32 s13, s40
+; GFX7LESS-NEXT: s_mov_b32 s14, s33
+; GFX7LESS-NEXT: v_mov_b32_e32 v31, v42
+; GFX7LESS-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX7LESS-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX7LESS-NEXT: s_waitcnt expcnt(0)
+; GFX7LESS-NEXT: v_mov_b32_e32 v2, s44
+; GFX7LESS-NEXT: v_mov_b32_e32 v3, s45
+; GFX7LESS-NEXT: s_waitcnt lgkmcnt(0)
+; GFX7LESS-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX7LESS-NEXT: v_and_b32_e32 v2, 1, v0
+; GFX7LESS-NEXT: buffer_load_dword v0, off, s[48:51], 0
+; GFX7LESS-NEXT: buffer_load_dword v1, off, s[48:51], 0 offset:4
+; GFX7LESS-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2
+; GFX7LESS-NEXT: s_or_b64 s[42:43], vcc, s[42:43]
+; GFX7LESS-NEXT: s_andn2_b64 exec, exec, s[42:43]
+; GFX7LESS-NEXT: s_cbranch_execnz .LBB17_1
+; GFX7LESS-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7LESS-NEXT: s_endpgm
+;
+; GFX9-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX9: ; %bb.0:
+; GFX9-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-NEXT: s_mov_b32 s50, -1
+; GFX9-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-NEXT: s_add_u32 s48, s48, s9
+; GFX9-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-NEXT: s_mov_b32 s33, s8
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-NEXT: s_mov_b32 s40, s7
+; GFX9-NEXT: s_mov_b32 s41, s6
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: s_movk_i32 s32, 0x800
+; GFX9-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0)
+; GFX9-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-NEXT: s_add_u32 s8, s36, 44
+; GFX9-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-NEXT: s_getpc_b64 s[0:1]
+; GFX9-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-NEXT: s_mov_b32 s12, s41
+; GFX9-NEXT: s_mov_b32 s13, s40
+; GFX9-NEXT: s_mov_b32 s14, s33
+; GFX9-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-NEXT: s_cbranch_execnz .LBB17_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_endpgm
+;
+; GFX1064-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064: ; %bb.0:
+; GFX1064-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-NEXT: s_mov_b32 s50, -1
+; GFX1064-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-NEXT: s_mov_b32 s33, s8
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-NEXT: s_mov_b32 s40, s7
+; GFX1064-NEXT: s_mov_b32 s41, s6
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-NEXT: s_mov_b32 s12, s41
+; GFX1064-NEXT: s_mov_b32 s13, s40
+; GFX1064-NEXT: s_mov_b32 s14, s33
+; GFX1064-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-NEXT: s_clause 0x1
+; GFX1064-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1064-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-NEXT: s_endpgm
+;
+; GFX1032-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032: ; %bb.0:
+; GFX1032-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-NEXT: s_mov_b32 s50, -1
+; GFX1032-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-NEXT: s_mov_b32 s33, s8
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-NEXT: s_mov_b32 s40, s7
+; GFX1032-NEXT: s_mov_b32 s41, s6
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-NEXT: s_mov_b32 s44, 0
+; GFX1032-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-NEXT: s_mov_b32 s12, s41
+; GFX1032-NEXT: s_mov_b32 s13, s40
+; GFX1032-NEXT: s_mov_b32 s14, s33
+; GFX1032-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-NEXT: s_clause 0x1
+; GFX1032-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1032-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-NEXT: s_endpgm
+;
+; GFX1164-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164: ; %bb.0:
+; GFX1164-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-NEXT: s_mov_b32 s33, s8
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s6
+; GFX1164-NEXT: s_mov_b32 s13, s7
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_mov_b32 s32, 32
+; GFX1164-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-NEXT: s_mov_b32 s40, s7
+; GFX1164-NEXT: s_mov_b32 s41, s6
+; GFX1164-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-NEXT: .p2align 6
+; GFX1164-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-NEXT: s_mov_b32 s12, s41
+; GFX1164-NEXT: s_mov_b32 s13, s40
+; GFX1164-NEXT: s_mov_b32 s14, s33
+; GFX1164-NEXT: s_clause 0x1
+; GFX1164-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1164-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-NEXT: s_endpgm
+;
+; GFX1132-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132: ; %bb.0:
+; GFX1132-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-NEXT: s_mov_b32 s40, s14
+; GFX1132-NEXT: s_mov_b32 s41, s13
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s13
+; GFX1132-NEXT: s_mov_b32 s13, s14
+; GFX1132-NEXT: s_mov_b32 s14, s15
+; GFX1132-NEXT: s_mov_b32 s32, 32
+; GFX1132-NEXT: s_mov_b32 s33, s15
+; GFX1132-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-NEXT: s_mov_b32 s44, 0
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-NEXT: .p2align 6
+; GFX1132-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-NEXT: s_mov_b32 s12, s41
+; GFX1132-NEXT: s_mov_b32 s13, s40
+; GFX1132-NEXT: s_mov_b32 s14, s33
+; GFX1132-NEXT: s_clause 0x1
+; GFX1132-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1132-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-NEXT: s_endpgm
+;
+; GFX9-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX9-DPP: ; %bb.0:
+; GFX9-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX9-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX9-DPP-NEXT: s_mov_b32 s50, -1
+; GFX9-DPP-NEXT: s_mov_b32 s51, 0xe00000
+; GFX9-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX9-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[36:37], s[2:3]
+; GFX9-DPP-NEXT: s_mov_b32 s33, s8
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_mov_b32 s40, s7
+; GFX9-DPP-NEXT: s_mov_b32 s41, s6
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX9-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX9-DPP-NEXT: s_mov_b64 s[34:35], s[4:5]
+; GFX9-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX9-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX9-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX9-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX9-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX9-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX9-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX9-DPP-NEXT: s_add_u32 s8, s36, 44
+; GFX9-DPP-NEXT: s_addc_u32 s9, s37, 0
+; GFX9-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX9-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX9-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX9-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX9-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX9-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX9-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX9-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX9-DPP-NEXT: s_mov_b64 s[10:11], s[34:35]
+; GFX9-DPP-NEXT: s_mov_b32 s12, s41
+; GFX9-DPP-NEXT: s_mov_b32 s13, s40
+; GFX9-DPP-NEXT: s_mov_b32 s14, s33
+; GFX9-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX9-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX9-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX9-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX9-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX9-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX9-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX9-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX9-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX9-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX9-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX9-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX9-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX9-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX9-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX9-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX9-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-DPP-NEXT: s_endpgm
+;
+; GFX1064-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1064-DPP: ; %bb.0:
+; GFX1064-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1064-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1064-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1064-DPP-NEXT: s_mov_b32 s51, 0x31e16000
+; GFX1064-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1064-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1064-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1064-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1064-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1064-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1064-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: s_movk_i32 s32, 0x800
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1064-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1064-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1064-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1064-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1064-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1064-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1064-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1064-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1064-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1064-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1064-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1064-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1064-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1064-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1064-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1064-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1064-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1064-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1064-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1064-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1064-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1064-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1064-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1064-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1064-DPP-NEXT: s_clause 0x1
+; GFX1064-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1064-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1064-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1064-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1064-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1064-DPP-NEXT: s_andn2_b64 exec, exec, s[44:45]
+; GFX1064-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1064-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1064-DPP-NEXT: s_endpgm
+;
+; GFX1032-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1032-DPP: ; %bb.0:
+; GFX1032-DPP-NEXT: s_mov_b32 s48, SCRATCH_RSRC_DWORD0
+; GFX1032-DPP-NEXT: s_mov_b32 s49, SCRATCH_RSRC_DWORD1
+; GFX1032-DPP-NEXT: s_mov_b32 s50, -1
+; GFX1032-DPP-NEXT: s_mov_b32 s51, 0x31c16000
+; GFX1032-DPP-NEXT: s_add_u32 s48, s48, s9
+; GFX1032-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1032-DPP-NEXT: s_addc_u32 s49, s49, 0
+; GFX1032-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1032-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[42:43], s[2:3], 0x24
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v2, 20, v2
+; GFX1032-DPP-NEXT: v_lshlrev_b32_e32 v1, 10, v1
+; GFX1032-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: v_or3_b32 v42, v0, v1, v2
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: s_movk_i32 s32, 0x400
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1032-DPP-NEXT: global_load_dwordx2 v[1:2], v43, s[42:43]
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1032-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1032-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1032-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1032-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1032-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1032-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1032-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1032-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1032-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1032-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1032-DPP-NEXT: buffer_store_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: buffer_store_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1032-DPP-NEXT: s_mov_b64 s[0:1], s[48:49]
+; GFX1032-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1032-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1032-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1032-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1032-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1032-DPP-NEXT: s_mov_b64 s[2:3], s[50:51]
+; GFX1032-DPP-NEXT: buffer_store_dword v4, off, s[48:51], 0 offset:12
+; GFX1032-DPP-NEXT: buffer_store_dword v3, off, s[48:51], 0 offset:8
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1032-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1032-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1032-DPP-NEXT: s_swappc_b64 s[30:31], s[6:7]
+; GFX1032-DPP-NEXT: s_clause 0x1
+; GFX1032-DPP-NEXT: buffer_load_dword v1, off, s[48:51], 0
+; GFX1032-DPP-NEXT: buffer_load_dword v2, off, s[48:51], 0 offset:4
+; GFX1032-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1032-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1032-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1032-DPP-NEXT: s_andn2_b32 exec_lo, exec_lo, s44
+; GFX1032-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1032-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1032-DPP-NEXT: s_endpgm
+;
+; GFX1164-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1164-DPP: ; %bb.0:
+; GFX1164-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1164-DPP-NEXT: s_mov_b32 s33, s8
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s6
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v42, v0
+; GFX1164-DPP-NEXT: s_mov_b32 s40, s7
+; GFX1164-DPP-NEXT: s_mov_b32 s41, s6
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v43, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v41, v1
+; GFX1164-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v40, v0
+; GFX1164-DPP-NEXT: s_mov_b64 s[44:45], 0
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1164-DPP-NEXT: .p2align 6
+; GFX1164-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1164-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1164-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX1164-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1164-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1164-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1164-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1164-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1164-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v31, v42
+; GFX1164-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v0, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v5, 8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v6, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1164-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1164-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1164-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1164-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1164-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1164-DPP-NEXT: s_clause 0x1
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1164-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v1, 0
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v2, s42
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v3, s43
+; GFX1164-DPP-NEXT: v_mov_b32_e32 v4, 0
+; GFX1164-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1164-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1164-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1164-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1164-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1164-DPP-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0
+; GFX1164-DPP-NEXT: s_or_b64 s[44:45], vcc, s[44:45]
+; GFX1164-DPP-NEXT: s_and_not1_b64 exec, exec, s[44:45]
+; GFX1164-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1164-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1164-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1164-DPP-NEXT: s_endpgm
+;
+; GFX1132-DPP-LABEL: global_atomic_fsub_double_uni_address_div_value_defalut_scope_strictfp:
+; GFX1132-DPP: ; %bb.0:
+; GFX1132-DPP-NEXT: s_mov_b64 s[34:35], s[2:3]
+; GFX1132-DPP-NEXT: s_mov_b64 s[38:39], s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, div.float.value@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, div.float.value@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v31, v0
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: s_load_b64 s[42:43], s[2:3], 0x24
+; GFX1132-DPP-NEXT: s_mov_b64 s[36:37], s[4:5]
+; GFX1132-DPP-NEXT: s_mov_b32 s40, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s41, s13
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s13
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s14
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s15
+; GFX1132-DPP-NEXT: s_mov_b32 s32, 32
+; GFX1132-DPP-NEXT: s_mov_b32 s33, s15
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v42, v0 :: v_dual_mov_b32 v43, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v40, v0 :: v_dual_mov_b32 v41, v1
+; GFX1132-DPP-NEXT: global_load_b64 v[1:2], v43, s[42:43]
+; GFX1132-DPP-NEXT: s_mov_b32 s44, 0
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x1
+; GFX1132-DPP-NEXT: .p2align 6
+; GFX1132-DPP-NEXT: .LBB17_1: ; %atomicrmw.start
+; GFX1132-DPP-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX1132-DPP-NEXT: s_waitcnt vmcnt(0)
+; GFX1132-DPP-NEXT: v_add_f64 v[3:4], v[1:2], -v[40:41]
+; GFX1132-DPP-NEXT: s_add_u32 s8, s34, 44
+; GFX1132-DPP-NEXT: s_addc_u32 s9, s35, 0
+; GFX1132-DPP-NEXT: s_getpc_b64 s[0:1]
+; GFX1132-DPP-NEXT: s_add_u32 s0, s0, __atomic_compare_exchange@gotpcrel32@lo+4
+; GFX1132-DPP-NEXT: s_addc_u32 s1, s1, __atomic_compare_exchange@gotpcrel32@hi+12
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v31, v42 :: v_dual_mov_b32 v0, 8
+; GFX1132-DPP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v5, 8 :: v_dual_mov_b32 v6, 0
+; GFX1132-DPP-NEXT: v_mov_b32_e32 v7, 0
+; GFX1132-DPP-NEXT: s_mov_b64 s[4:5], s[38:39]
+; GFX1132-DPP-NEXT: s_mov_b64 s[10:11], s[36:37]
+; GFX1132-DPP-NEXT: s_mov_b32 s12, s41
+; GFX1132-DPP-NEXT: s_mov_b32 s13, s40
+; GFX1132-DPP-NEXT: s_mov_b32 s14, s33
+; GFX1132-DPP-NEXT: s_clause 0x1
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[1:2], off
+; GFX1132-DPP-NEXT: scratch_store_b64 off, v[3:4], off offset:8
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s42
+; GFX1132-DPP-NEXT: v_dual_mov_b32 v3, s43 :: v_dual_mov_b32 v4, 0
+; GFX1132-DPP-NEXT: s_waitcnt lgkmcnt(0)
+; GFX1132-DPP-NEXT: s_swappc_b64 s[30:31], s[0:1]
+; GFX1132-DPP-NEXT: scratch_load_b64 v[1:2], off, off
+; GFX1132-DPP-NEXT: v_and_b32_e32 v0, 1, v0
+; GFX1132-DPP-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1)
+; GFX1132-DPP-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0
+; GFX1132-DPP-NEXT: s_or_b32 s44, vcc_lo, s44
+; GFX1132-DPP-NEXT: s_and_not1_b32 exec_lo, exec_lo, s44
+; GFX1132-DPP-NEXT: s_cbranch_execnz .LBB17_1
+; GFX1132-DPP-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX1132-DPP-NEXT: s_set_inst_prefetch_distance 0x2
+; GFX1132-DPP-NEXT: s_endpgm
+ %divValue = call double @div.float.value() strictfp
+ %result = atomicrmw fsub ptr addrspace(1) %ptr, double %divValue monotonic, align 4
+ ret void
+}
+
attributes #0 = { "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #1 = { strictfp "denormal-fp-math-f32"="preserve-sign,preserve-sign" "amdgpu-unsafe-fp-atomics"="true" }
attributes #2 = { strictfp}
diff --git a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
index bdd89a9..dde84af 100644
--- a/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
+++ b/llvm/test/CodeGen/AMDGPU/greedy-alloc-fail-sgpr1024-spill.mir
@@ -13,6 +13,7 @@
name: greedy_fail_alloc_sgpr1024_spill
tracksRegLiveness: true
frameInfo:
+ adjustsStack: true
hasCalls: true
machineFunctionInfo:
explicitKernArgSize: 16
diff --git a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
index a5792bf..4c21f87 100644
--- a/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
+++ b/llvm/test/CodeGen/AMDGPU/implicitarg-offset-attributes.ll
@@ -258,25 +258,25 @@ attributes #0 = { nocallback nofree nosync nounwind speculatable willreturn memo
;.
; V4: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V4: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V4: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V4: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V5: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V5: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V5: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V5: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V6: attributes #[[ATTR0:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
-; V6: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR4]] = { "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
-; V6: attributes #[[ATTR5]] = { "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR4]] = { "amdgpu-no-agpr" "amdgpu-no-default-queue" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
+; V6: attributes #[[ATTR5]] = { "amdgpu-no-agpr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-multigrid-sync-arg" "uniform-work-group-size"="false" }
;.
; V4: [[META0:![0-9]+]] = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
index e015095a..ab160ff 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-cc.ll
@@ -92,7 +92,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b
; DAGISEL-GFX11-NEXT: $vgpr5 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr6 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr7 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -122,7 +121,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc(<4 x i32> inreg %a, <4 x i32> %b
; DAGISEL-GFX10-NEXT: $vgpr5 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr6 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr7 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -234,7 +232,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad
; DAGISEL-GFX11-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -272,7 +269,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_ptr(ptr inreg %a, ptr %b, ptr ad
; DAGISEL-GFX10-NEXT: $vgpr9 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr10 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -404,7 +400,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in
; DAGISEL-GFX11-NEXT: $vgpr11 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr12 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -454,7 +449,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_struct( {ptr, i32, <4 x i32>} in
; DAGISEL-GFX10-NEXT: $vgpr11 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr12 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF2:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -506,7 +500,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b)
; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -524,7 +517,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_float(float inreg %a, float %b)
; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -576,7 +568,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) {
; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -594,7 +585,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_half(half inreg %a, half %b) {
; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -646,7 +636,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %
; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -664,7 +653,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_bfloat(bfloat inreg %a, bfloat %
; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -716,7 +704,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) {
; DAGISEL-GFX11-NEXT: [[S_LOAD_DWORDX2_IMM:%[0-9]+]]:sreg_64_xexec = S_LOAD_DWORDX2_IMM killed [[SI_PC_ADD_REL_OFFSET]], 0, 0 :: (dereferenceable invariant load (s64) from got, addrspace 4)
; DAGISEL-GFX11-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -734,7 +721,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_i16(i16 inreg %a, i16 %b) {
; DAGISEL-GFX10-NEXT: $sgpr0_sgpr1_sgpr2_sgpr3 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr0 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr1 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -870,7 +856,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16
; DAGISEL-GFX11-NEXT: $vgpr13 = COPY [[COPY2]]
; DAGISEL-GFX11-NEXT: $vgpr14 = COPY [[COPY1]]
; DAGISEL-GFX11-NEXT: $vgpr15 = COPY [[COPY]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -916,7 +901,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_cc_v16i16(<16 x i16> inreg %a, <16
; DAGISEL-GFX10-NEXT: $vgpr13 = COPY [[COPY2]]
; DAGISEL-GFX10-NEXT: $vgpr14 = COPY [[COPY1]]
; DAGISEL-GFX10-NEXT: $vgpr15 = COPY [[COPY]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
@@ -2480,7 +2464,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128
; DAGISEL-GFX11-NEXT: $vgpr29 = COPY [[COPY134]]
; DAGISEL-GFX11-NEXT: $vgpr30 = COPY [[COPY133]]
; DAGISEL-GFX11-NEXT: $vgpr31 = COPY [[COPY132]]
- ; DAGISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX11-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
; DAGISEL-GFX11-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX11-NEXT: S_ENDPGM 0
@@ -2827,7 +2810,6 @@ define amdgpu_cs_chain void @amdgpu_cs_chain_many_regs(<36 x i32> inreg %a, <128
; DAGISEL-GFX10-NEXT: $vgpr29 = COPY [[COPY134]]
; DAGISEL-GFX10-NEXT: $vgpr30 = COPY [[COPY133]]
; DAGISEL-GFX10-NEXT: $vgpr31 = COPY [[COPY132]]
- ; DAGISEL-GFX10-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
; DAGISEL-GFX10-NEXT: $sgpr30_sgpr31 = SI_CALL killed [[S_LOAD_DWORDX2_IMM]], @use, csr_amdgpu_si_gfx, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3, implicit $vgpr4, implicit $vgpr5, implicit $vgpr6, implicit $vgpr7, implicit $vgpr8, implicit $vgpr9, implicit $vgpr10, implicit $vgpr11, implicit $vgpr12, implicit $vgpr13, implicit $vgpr14, implicit $vgpr15, implicit $vgpr16, implicit $vgpr17, implicit $vgpr18, implicit $vgpr19, implicit $vgpr20, implicit $vgpr21, implicit $vgpr22, implicit $vgpr23, implicit $vgpr24, implicit $vgpr25, implicit $vgpr26, implicit $vgpr27, implicit $vgpr28, implicit $vgpr29, implicit $vgpr30, implicit $vgpr31
; DAGISEL-GFX10-NEXT: ADJCALLSTACKDOWN 0, 528, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32
; DAGISEL-GFX10-NEXT: S_ENDPGM 0
diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
new file mode 100644
index 0000000..bfeb214
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll
@@ -0,0 +1,1618 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4
+; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,SDAG %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 < %s | FileCheck -check-prefixes=GCN,GISEL %s
+
+define float @sitofp_i128_to_f32(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f32:
+; SDAG: ; %bb.0: ; %itofp-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB0_14
+; SDAG-NEXT: ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3
+; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0
+; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1
+; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5
+; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2
+; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3
+; SDAG-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
+; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v4
+; SDAG-NEXT: v_add_u32_e32 v2, 32, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v5
+; SDAG-NEXT: v_min_u32_e32 v2, v2, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT: v_add_u32_e32 v6, 32, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v7, v1
+; SDAG-NEXT: v_min_u32_e32 v6, v6, v7
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_add_u32_e32 v6, 64, v6
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; SDAG-NEXT: ; implicit-def: $vgpr6
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr7
+; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: ; %bb.3: ; %Flow3
+; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB0_13
+; SDAG-NEXT: ; %bb.4: ; %NodeBlock
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB0_8
+; SDAG-NEXT: ; %bb.5: ; %LeafBlock
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6
+; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB0_7
+; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7
+; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5]
+; SDAG-NEXT: v_sub_u32_e32 v13, 38, v7
+; SDAG-NEXT: v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT: v_add_u32_e32 v14, 26, v7
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5]
+; SDAG-NEXT: v_subrev_u32_e32 v7, 38, v7
+; SDAG-NEXT: v_cndmask_b32_e64 v15, v8, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[7:8], v7, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v11, v13, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v5
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v15, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, v8
+; SDAG-NEXT: v_mov_b32_e32 v1, v9
+; SDAG-NEXT: .LBB0_7: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB0_8: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0
+; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3
+; SDAG-NEXT: v_mov_b32_e32 v2, v6
+; SDAG-NEXT: ; %bb.12: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB0_13: ; %Flow4
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3
+; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0
+; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8
+; SDAG-NEXT: v_or3_b32 v4, v2, v0, v1
+; SDAG-NEXT: .LBB0_14: ; %Flow5
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, v4
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f32:
+; GISEL: ; %bb.0: ; %itofp-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB0_14
+; GISEL-NEXT: ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0
+; GISEL-NEXT: v_xor_b32_e32 v1, v6, v1
+; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT: v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT: v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT: v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT: v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5
+; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8
+; GISEL-NEXT: ; implicit-def: $vgpr4
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT: ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr2
+; GISEL-NEXT: ; %bb.3: ; %Flow3
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB0_13
+; GISEL-NEXT: ; %bb.4: ; %NodeBlock
+; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB0_8
+; GISEL-NEXT: ; %bb.5: ; %LeafBlock
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB0_7
+; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4
+; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3]
+; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT: v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT: v_or_b32_e32 v12, v10, v12
+; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT: v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc
+; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT: v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT: v_or_b32_e32 v16, v10, v12
+; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT: v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GISEL-NEXT: v_mov_b32_e32 v3, v6
+; GISEL-NEXT: .LBB0_7: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: .LBB0_8: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v7, v8
+; GISEL-NEXT: ; %bb.12: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB0_13: ; %Flow4
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0
+; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT: v_or3_b32 v4, v2, v0, v1
+; GISEL-NEXT: .LBB0_14: ; %Flow5
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = sitofp i128 %x to float
+ ret float %cvt
+}
+
+define float @uitofp_i128_to_f32(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f32:
+; SDAG: ; %bb.0: ; %itofp-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB1_14
+; SDAG-NEXT: ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT: v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT: v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT: v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT: v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT: v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6
+; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5
+; SDAG-NEXT: ; implicit-def: $vgpr7
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc
+; SDAG-NEXT: ; implicit-def: $vgpr5
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr6
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: ; %bb.3: ; %Flow3
+; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_13
+; SDAG-NEXT: ; %bb.4: ; %NodeBlock
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB1_8
+; SDAG-NEXT: ; %bb.5: ; %LeafBlock
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5
+; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB1_7
+; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6
+; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT: v_lshrrev_b64 v[7:8], v11, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT: v_sub_u32_e32 v12, 38, v6
+; SDAG-NEXT: v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT: v_or_b32_e32 v9, v7, v9
+; SDAG-NEXT: v_lshrrev_b64 v[7:8], v12, v[2:3]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT: v_add_u32_e32 v13, 26, v6
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3]
+; SDAG-NEXT: v_subrev_u32_e32 v6, 38, v6
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v7, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT: v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v13, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13
+; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v7, v14, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, v7
+; SDAG-NEXT: v_mov_b32_e32 v1, v8
+; SDAG-NEXT: .LBB1_7: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB1_8: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0
+; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3
+; SDAG-NEXT: v_mov_b32_e32 v4, v5
+; SDAG-NEXT: ; %bb.12: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB1_13: ; %Flow4
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7
+; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0
+; SDAG-NEXT: v_add_u32_e32 v4, 1.0, v0
+; SDAG-NEXT: .LBB1_14: ; %Flow5
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, v4
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f32:
+; GISEL: ; %bb.0: ; %itofp-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB1_14
+; GISEL-NEXT: ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT: v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT: v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT: v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT: v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT: v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5
+; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7
+; GISEL-NEXT: ; implicit-def: $vgpr4
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT: ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT: ; implicit-def: $vgpr7
+; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr2
+; GISEL-NEXT: ; %bb.3: ; %Flow3
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB1_13
+; GISEL-NEXT: ; %bb.4: ; %NodeBlock
+; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB1_8
+; GISEL-NEXT: ; %bb.5: ; %LeafBlock
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB1_7
+; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4
+; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT: v_or_b32_e32 v10, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT: v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc
+; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1
+; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT: v_or_b32_e32 v14, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v2, v8, v2
+; GISEL-NEXT: v_and_b32_e32 v3, v9, v3
+; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v12, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GISEL-NEXT: v_mov_b32_e32 v3, v6
+; GISEL-NEXT: .LBB1_7: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: .LBB1_8: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v6, v7
+; GISEL-NEXT: ; %bb.12: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB1_13: ; %Flow4
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff
+; GISEL-NEXT: v_and_or_b32 v4, v4, v1, v0
+; GISEL-NEXT: .LBB1_14: ; %Flow5
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = uitofp i128 %x to float
+ ret float %cvt
+}
+
+define double @sitofp_i128_to_f64(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f64:
+; SDAG: ; %bb.0: ; %itofp-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_mov_b32_e32 v5, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v0
+; SDAG-NEXT: v_or_b32_e32 v1, v5, v3
+; SDAG-NEXT: v_or_b32_e32 v0, v4, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_mov_b32_e32 v0, 0
+; SDAG-NEXT: v_mov_b32_e32 v1, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB2_14
+; SDAG-NEXT: ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT: v_ashrrev_i32_e32 v0, 31, v3
+; SDAG-NEXT: v_xor_b32_e32 v4, v0, v4
+; SDAG-NEXT: v_xor_b32_e32 v5, v0, v5
+; SDAG-NEXT: v_sub_co_u32_e32 v4, vcc, v4, v0
+; SDAG-NEXT: v_xor_b32_e32 v2, v0, v2
+; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v0, vcc
+; SDAG-NEXT: v_xor_b32_e32 v1, v0, v3
+; SDAG-NEXT: v_subb_co_u32_e32 v6, vcc, v2, v0, vcc
+; SDAG-NEXT: v_subb_co_u32_e32 v7, vcc, v1, v0, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v0, v6
+; SDAG-NEXT: v_add_u32_e32 v0, 32, v0
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v7
+; SDAG-NEXT: v_min_u32_e32 v0, v0, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v1, v4
+; SDAG-NEXT: v_add_u32_e32 v1, 32, v1
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v5
+; SDAG-NEXT: v_min_u32_e32 v1, v1, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[6:7]
+; SDAG-NEXT: v_add_u32_e32 v1, 64, v1
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc
+; SDAG-NEXT: v_sub_u32_e32 v8, 0x80, v9
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v9
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v8
+; SDAG-NEXT: ; implicit-def: $vgpr10
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT: v_add_u32_e32 v6, 0xffffffb5, v9
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v6, v[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v6
+; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: ; implicit-def: $vgpr6_vgpr7
+; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: ; %bb.3: ; %Flow3
+; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB2_13
+; SDAG-NEXT: ; %bb.4: ; %NodeBlock
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v8
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB2_8
+; SDAG-NEXT: ; %bb.5: ; %LeafBlock
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8
+; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB2_7
+; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT: v_sub_u32_e32 v12, 0x49, v9
+; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v12, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[6:7]
+; SDAG-NEXT: v_sub_u32_e32 v13, 9, v9
+; SDAG-NEXT: v_or_b32_e32 v11, v1, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v0, v10
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], v13, v[6:7]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT: v_add_u32_e32 v16, 55, v9
+; SDAG-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[10:11], v12, v[6:7]
+; SDAG-NEXT: v_lshrrev_b64 v[12:13], v13, v[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[14:15], v16, v[6:7]
+; SDAG-NEXT: v_add_u32_e32 v9, -9, v9
+; SDAG-NEXT: v_or_b32_e32 v15, v15, v13
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v12
+; SDAG-NEXT: v_lshlrev_b64 v[12:13], v9, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v11, 0, v11, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v16
+; SDAG-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v13, v15, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v16
+; SDAG-NEXT: v_lshlrev_b64 v[4:5], v16, v[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v7, v9, v7, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v12, v14, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v6, v9, v6, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; SDAG-NEXT: v_or_b32_e32 v5, v5, v7
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v6
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v6, v10
+; SDAG-NEXT: v_cndmask_b32_e64 v4, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT: v_mov_b32_e32 v5, v1
+; SDAG-NEXT: v_mov_b32_e32 v4, v0
+; SDAG-NEXT: v_mov_b32_e32 v7, v11
+; SDAG-NEXT: .LBB2_7: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB2_8: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT: v_lshlrev_b64 v[6:7], 1, v[6:7]
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 31, v5
+; SDAG-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5]
+; SDAG-NEXT: v_or_b32_e32 v6, v6, v0
+; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: v_lshrrev_b32_e32 v0, 2, v4
+; SDAG-NEXT: v_and_or_b32 v0, v0, 1, v4
+; SDAG-NEXT: v_add_co_u32_e32 v4, vcc, 1, v0
+; SDAG-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v6, vcc, 0, v6, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], 2, v[4:5]
+; SDAG-NEXT: v_lshlrev_b32_e32 v7, 30, v6
+; SDAG-NEXT: v_or_b32_e32 v10, v1, v7
+; SDAG-NEXT: v_and_b32_e32 v1, 0x800000, v5
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v1
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT: v_lshrrev_b64 v[0:1], 3, v[4:5]
+; SDAG-NEXT: v_lshlrev_b32_e32 v2, 29, v6
+; SDAG-NEXT: v_or_b32_e32 v10, v1, v2
+; SDAG-NEXT: v_mov_b32_e32 v2, v8
+; SDAG-NEXT: ; %bb.12: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB2_13: ; %Flow4
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_and_b32_e32 v1, 0x80000000, v3
+; SDAG-NEXT: v_mov_b32_e32 v3, 0x3ff00000
+; SDAG-NEXT: v_lshl_add_u32 v2, v2, 20, v3
+; SDAG-NEXT: v_and_b32_e32 v3, 0xfffff, v10
+; SDAG-NEXT: v_or3_b32 v1, v3, v1, v2
+; SDAG-NEXT: .LBB2_14: ; %Flow5
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f64:
+; GISEL: ; %bb.0: ; %itofp-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_mov_b32_e32 v4, v0
+; GISEL-NEXT: v_mov_b32_e32 v5, v1
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_or_b32_e32 v0, v4, v2
+; GISEL-NEXT: v_or_b32_e32 v1, v5, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v0, s4
+; GISEL-NEXT: v_mov_b32_e32 v1, s5
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB2_14
+; GISEL-NEXT: ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4
+; GISEL-NEXT: v_xor_b32_e32 v1, v6, v5
+; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT: v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT: v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT: v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT: v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc
+; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9
+; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8
+; GISEL-NEXT: ; implicit-def: $vgpr10
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT: ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr9
+; GISEL-NEXT: ; %bb.3: ; %Flow3
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB2_13
+; GISEL-NEXT: ; %bb.4: ; %NodeBlock
+; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v8
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB2_8
+; GISEL-NEXT: ; %bb.5: ; %LeafBlock
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v8
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB2_7
+; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9
+; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14
+; GISEL-NEXT: v_or_b32_e32 v10, v4, v10
+; GISEL-NEXT: v_or_b32_e32 v11, v5, v11
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3]
+; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[2:3]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GISEL-NEXT: v_add_u32_e32 v14, 55, v9
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc
+; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14
+; GISEL-NEXT: v_cndmask_b32_e64 v13, v4, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc
+; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1
+; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14
+; GISEL-NEXT: v_or_b32_e32 v16, v9, v11
+; GISEL-NEXT: v_or_b32_e32 v17, v10, v12
+; GISEL-NEXT: v_lshrrev_b64 v[11:12], v15, -1
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT: v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT: v_and_or_b32 v0, v11, v0, v2
+; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GISEL-NEXT: v_mov_b32_e32 v3, v6
+; GISEL-NEXT: .LBB2_7: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: .LBB2_8: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GISEL-NEXT: v_or_b32_e32 v11, v2, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v9
+; GISEL-NEXT: v_mov_b32_e32 v1, v10
+; GISEL-NEXT: v_mov_b32_e32 v2, v11
+; GISEL-NEXT: v_mov_b32_e32 v3, v12
+; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v3
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v9, 0
+; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10]
+; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v7, v8
+; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5
+; GISEL-NEXT: ; %bb.12: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB2_13: ; %Flow4
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000
+; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff
+; GISEL-NEXT: v_lshl_add_u32 v1, v7, 20, v1
+; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0
+; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0
+; GISEL-NEXT: v_or3_b32 v1, v2, v1, 0
+; GISEL-NEXT: .LBB2_14: ; %Flow5
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = sitofp i128 %x to double
+ ret double %cvt
+}
+
+define double @uitofp_i128_to_f64(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f64:
+; SDAG: ; %bb.0: ; %itofp-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: v_mov_b32_e32 v5, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB3_14
+; SDAG-NEXT: ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT: v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT: v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT: v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT: v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT: v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc
+; SDAG-NEXT: v_sub_u32_e32 v7, 0x80, v8
+; SDAG-NEXT: v_sub_u32_e32 v6, 0x7f, v8
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 54, v7
+; SDAG-NEXT: ; implicit-def: $vgpr9
+; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; SDAG-NEXT: ; implicit-def: $vgpr7
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: ; %bb.3: ; %Flow3
+; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_13
+; SDAG-NEXT: ; %bb.4: ; %NodeBlock
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 54, v7
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB3_8
+; SDAG-NEXT: ; %bb.5: ; %LeafBlock
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7
+; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB3_7
+; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT: v_sub_u32_e32 v11, 0x49, v8
+; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT: v_lshrrev_b64 v[4:5], v11, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT: v_sub_u32_e32 v12, 9, v8
+; SDAG-NEXT: v_or_b32_e32 v10, v5, v10
+; SDAG-NEXT: v_or_b32_e32 v9, v4, v9
+; SDAG-NEXT: v_lshrrev_b64 v[4:5], v12, v[2:3]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT: v_add_u32_e32 v15, 55, v8
+; SDAG-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[9:10], v11, v[2:3]
+; SDAG-NEXT: v_lshrrev_b64 v[11:12], v12, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[13:14], v15, v[2:3]
+; SDAG-NEXT: v_add_u32_e32 v8, -9, v8
+; SDAG-NEXT: v_or_b32_e32 v14, v14, v12
+; SDAG-NEXT: v_or_b32_e32 v13, v13, v11
+; SDAG-NEXT: v_lshlrev_b64 v[11:12], v8, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v15
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v5, v1, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v4, v0, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v12, v14, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v15
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v15, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v8, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v11, v13, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v8, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_mov_b32_e32 v2, v9
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v4, v4, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, v4
+; SDAG-NEXT: v_mov_b32_e32 v1, v5
+; SDAG-NEXT: v_mov_b32_e32 v3, v10
+; SDAG-NEXT: .LBB3_7: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB3_8: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3]
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 31, v1
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT: v_or_b32_e32 v2, v2, v3
+; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: v_lshrrev_b32_e32 v3, 2, v0
+; SDAG-NEXT: v_and_or_b32 v0, v3, 1, v0
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; SDAG-NEXT: v_and_b32_e32 v3, 0x800000, v1
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v3
+; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 2
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; SDAG-NEXT: v_alignbit_b32 v9, v2, v1, 3
+; SDAG-NEXT: v_mov_b32_e32 v6, v7
+; SDAG-NEXT: ; %bb.12: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB3_13: ; %Flow4
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_and_b32_e32 v0, 0xfffff, v9
+; SDAG-NEXT: v_lshl_or_b32 v0, v6, 20, v0
+; SDAG-NEXT: v_add_u32_e32 v5, 0x3ff00000, v0
+; SDAG-NEXT: .LBB3_14: ; %Flow5
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, v4
+; SDAG-NEXT: v_mov_b32_e32 v1, v5
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f64:
+; GISEL: ; %bb.0: ; %itofp-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: s_mov_b64 s[4:5], 0
+; GISEL-NEXT: v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: v_mov_b32_e32 v5, s5
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB3_14
+; GISEL-NEXT: ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT: v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT: v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT: v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT: v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT: v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v5, v4, vcc
+; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v8
+; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v8
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v7
+; GISEL-NEXT: ; implicit-def: $vgpr9
+; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT: ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v8
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v1, vcc
+; GISEL-NEXT: ; implicit-def: $vgpr7
+; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: ; %bb.3: ; %Flow3
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB3_13
+; GISEL-NEXT: ; %bb.4: ; %NodeBlock
+; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 55, v7
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB3_8
+; GISEL-NEXT: ; %bb.5: ; %LeafBlock
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 55, v7
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB3_7
+; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT: v_sub_u32_e32 v13, 0x49, v8
+; GISEL-NEXT: v_sub_u32_e32 v9, 64, v13
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], v13, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3]
+; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v13
+; GISEL-NEXT: v_lshrrev_b64 v[11:12], v13, v[2:3]
+; GISEL-NEXT: v_or_b32_e32 v9, v4, v9
+; GISEL-NEXT: v_or_b32_e32 v10, v5, v10
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[2:3]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13
+; GISEL-NEXT: v_add_u32_e32 v8, 55, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v9, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v10, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc
+; GISEL-NEXT: v_sub_u32_e32 v12, 64, v8
+; GISEL-NEXT: v_cndmask_b32_e64 v14, v4, v0, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v9, v5, v1, s[4:5]
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], v8, -1
+; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1
+; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v8
+; GISEL-NEXT: v_or_b32_e32 v16, v4, v12
+; GISEL-NEXT: v_or_b32_e32 v17, v5, v13
+; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v8
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v4, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v5, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v8, v12, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v2, v4, v2
+; GISEL-NEXT: v_and_b32_e32 v3, v5, v3
+; GISEL-NEXT: v_and_or_b32 v0, v8, v0, v2
+; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v8, v14, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mov_b32_e32 v3, v11
+; GISEL-NEXT: .LBB3_7: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: .LBB3_8: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT: v_lshlrev_b64 v[8:9], 1, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[10:11], 1, v[2:3]
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1
+; GISEL-NEXT: v_or_b32_e32 v10, v10, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v8
+; GISEL-NEXT: v_mov_b32_e32 v1, v9
+; GISEL-NEXT: v_mov_b32_e32 v2, v10
+; GISEL-NEXT: v_mov_b32_e32 v3, v11
+; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: v_bfe_u32 v4, v0, 2, 1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v4
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc
+; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
+; GISEL-NEXT: v_mov_b32_e32 v8, 0
+; GISEL-NEXT: v_and_b32_e32 v9, 0x800000, v1
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[8:9]
+; GISEL-NEXT: v_lshlrev_b64 v[8:9], 30, v[2:3]
+; GISEL-NEXT: v_lshrrev_b32_e32 v5, 2, v1
+; GISEL-NEXT: v_or_b32_e32 v9, v5, v8
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT: v_lshlrev_b64 v[2:3], 29, v[2:3]
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT: v_lshrrev_b32_e32 v0, 3, v1
+; GISEL-NEXT: v_or_b32_e32 v9, v0, v2
+; GISEL-NEXT: v_mov_b32_e32 v6, v7
+; GISEL-NEXT: ; %bb.12: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB3_13: ; %Flow4
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000
+; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0
+; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9
+; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0
+; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0
+; GISEL-NEXT: .LBB3_14: ; %Flow5
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GISEL-NEXT: v_mov_b32_e32 v1, v5
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = uitofp i128 %x to double
+ ret double %cvt
+}
+
+define half @sitofp_i128_to_f16(i128 %x) {
+; SDAG-LABEL: sitofp_i128_to_f16:
+; SDAG: ; %bb.0: ; %itofp-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB4_14
+; SDAG-NEXT: ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT: v_ashrrev_i32_e32 v5, 31, v3
+; SDAG-NEXT: v_xor_b32_e32 v0, v5, v0
+; SDAG-NEXT: v_xor_b32_e32 v1, v5, v1
+; SDAG-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v5
+; SDAG-NEXT: v_xor_b32_e32 v2, v5, v2
+; SDAG-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v5, vcc
+; SDAG-NEXT: v_xor_b32_e32 v6, v5, v3
+; SDAG-NEXT: v_subb_co_u32_e32 v4, vcc, v2, v5, vcc
+; SDAG-NEXT: v_subb_co_u32_e32 v5, vcc, v6, v5, vcc
+; SDAG-NEXT: v_ffbh_u32_e32 v2, v4
+; SDAG-NEXT: v_add_u32_e32 v2, 32, v2
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v5
+; SDAG-NEXT: v_min_u32_e32 v2, v2, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v0
+; SDAG-NEXT: v_add_u32_e32 v6, 32, v6
+; SDAG-NEXT: v_ffbh_u32_e32 v7, v1
+; SDAG-NEXT: v_min_u32_e32 v6, v6, v7
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_add_u32_e32 v6, 64, v6
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v6, v2, vcc
+; SDAG-NEXT: v_sub_u32_e32 v6, 0x80, v7
+; SDAG-NEXT: v_sub_u32_e32 v2, 0x7f, v7
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v6
+; SDAG-NEXT: ; implicit-def: $vgpr8
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT: v_add_u32_e32 v4, 0xffffff98, v7
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v4, v[0:1]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; SDAG-NEXT: v_cndmask_b32_e32 v8, 0, v0, vcc
+; SDAG-NEXT: ; implicit-def: $vgpr6
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr7
+; SDAG-NEXT: ; implicit-def: $vgpr4_vgpr5
+; SDAG-NEXT: ; %bb.3: ; %Flow3
+; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB4_13
+; SDAG-NEXT: ; %bb.4: ; %NodeBlock
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v6
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB4_8
+; SDAG-NEXT: ; %bb.5: ; %LeafBlock
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v6
+; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB4_7
+; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT: v_sub_u32_e32 v12, 0x66, v7
+; SDAG-NEXT: v_sub_u32_e32 v10, 64, v12
+; SDAG-NEXT: v_lshrrev_b64 v[8:9], v12, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5]
+; SDAG-NEXT: v_sub_u32_e32 v13, 38, v7
+; SDAG-NEXT: v_or_b32_e32 v11, v9, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT: v_lshrrev_b64 v[8:9], v13, v[4:5]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v12
+; SDAG-NEXT: v_add_u32_e32 v14, 26, v7
+; SDAG-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v12
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[10:11], v13, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[12:13], v14, v[4:5]
+; SDAG-NEXT: v_subrev_u32_e32 v7, 38, v7
+; SDAG-NEXT: v_cndmask_b32_e64 v15, v8, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[7:8], v7, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v9, v9, v1, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v11, v13, v11
+; SDAG-NEXT: v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v14, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v11, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v5, v8, v5, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v4, v7, v4, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v5
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v4
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v8, v15, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, v8
+; SDAG-NEXT: v_mov_b32_e32 v1, v9
+; SDAG-NEXT: .LBB4_7: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB4_8: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: v_lshrrev_b32_e32 v4, 2, v0
+; SDAG-NEXT: v_and_or_b32 v0, v4, 1, v0
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT: v_and_b32_e32 v4, 0x4000000, v0
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v4
+; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 2
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT: v_alignbit_b32 v8, v1, v0, 3
+; SDAG-NEXT: v_mov_b32_e32 v2, v6
+; SDAG-NEXT: ; %bb.12: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB4_13: ; %Flow4
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_and_b32_e32 v0, 0x80000000, v3
+; SDAG-NEXT: v_lshl_add_u32 v1, v2, 23, 1.0
+; SDAG-NEXT: v_and_b32_e32 v2, 0x7fffff, v8
+; SDAG-NEXT: v_or3_b32 v0, v2, v0, v1
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT: .LBB4_14: ; %Flow5
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, v4
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: sitofp_i128_to_f16:
+; GISEL: ; %bb.0: ; %itofp-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB4_14
+; GISEL-NEXT: ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3
+; GISEL-NEXT: v_xor_b32_e32 v0, v6, v0
+; GISEL-NEXT: v_xor_b32_e32 v1, v6, v1
+; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6
+; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2
+; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc
+; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3
+; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc
+; GISEL-NEXT: v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT: v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v7, v2
+; GISEL-NEXT: v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT: v_add_u32_e32 v7, 32, v7
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT: v_min_u32_e32 v5, v5, v7
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v5
+; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v5
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v8
+; GISEL-NEXT: ; implicit-def: $vgpr4
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT: ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT: ; implicit-def: $vgpr8
+; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr2
+; GISEL-NEXT: ; %bb.3: ; %Flow3
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB4_13
+; GISEL-NEXT: ; %bb.4: ; %NodeBlock
+; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v8
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB4_8
+; GISEL-NEXT: ; %bb.5: ; %LeafBlock
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v8
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB4_7
+; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT: v_sub_u32_e32 v11, 64, v4
+; GISEL-NEXT: v_lshrrev_b64 v[9:10], v4, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, v[2:3]
+; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v4
+; GISEL-NEXT: v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT: v_or_b32_e32 v12, v10, v12
+; GISEL-NEXT: v_lshrrev_b64 v[9:10], v13, v[2:3]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT: v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v12, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_sub_u32_e32 v11, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v13, v9, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v10, v1, vcc
+; GISEL-NEXT: v_lshrrev_b64 v[9:10], v5, -1
+; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1
+; GISEL-NEXT: v_subrev_u32_e32 v14, 64, v5
+; GISEL-NEXT: v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT: v_or_b32_e32 v16, v10, v12
+; GISEL-NEXT: v_lshrrev_b64 v[11:12], v14, -1
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v11, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v11, v12, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v2, v9, v2
+; GISEL-NEXT: v_and_b32_e32 v3, v10, v3
+; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT: v_and_or_b32 v1, v11, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v13, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GISEL-NEXT: v_mov_b32_e32 v3, v6
+; GISEL-NEXT: .LBB4_7: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: .LBB4_8: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v7, v8
+; GISEL-NEXT: ; %bb.12: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB4_13: ; %Flow4
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6
+; GISEL-NEXT: v_lshl_add_u32 v1, v7, 23, 1.0
+; GISEL-NEXT: v_and_b32_e32 v2, 0x7fffff, v4
+; GISEL-NEXT: v_or3_b32 v0, v2, v0, v1
+; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GISEL-NEXT: .LBB4_14: ; %Flow5
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = sitofp i128 %x to half
+ ret half %cvt
+}
+
+define half @uitofp_i128_to_f16(i128 %x) {
+; SDAG-LABEL: uitofp_i128_to_f16:
+; SDAG: ; %bb.0: ; %itofp-entry
+; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-NEXT: v_or_b32_e32 v5, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v4, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; SDAG-NEXT: v_mov_b32_e32 v4, 0
+; SDAG-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB5_14
+; SDAG-NEXT: ; %bb.1: ; %itofp-if-end
+; SDAG-NEXT: v_ffbh_u32_e32 v4, v2
+; SDAG-NEXT: v_add_u32_e32 v4, 32, v4
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v3
+; SDAG-NEXT: v_min_u32_e32 v4, v4, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v5, v0
+; SDAG-NEXT: v_add_u32_e32 v5, 32, v5
+; SDAG-NEXT: v_ffbh_u32_e32 v6, v1
+; SDAG-NEXT: v_min_u32_e32 v5, v5, v6
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; SDAG-NEXT: v_add_u32_e32 v5, 64, v5
+; SDAG-NEXT: v_cndmask_b32_e32 v6, v5, v4, vcc
+; SDAG-NEXT: v_sub_u32_e32 v5, 0x80, v6
+; SDAG-NEXT: v_sub_u32_e32 v4, 0x7f, v6
+; SDAG-NEXT: v_cmp_gt_i32_e32 vcc, 25, v5
+; SDAG-NEXT: ; implicit-def: $vgpr7
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; SDAG-NEXT: ; %bb.2: ; %itofp-if-else
+; SDAG-NEXT: v_add_u32_e32 v2, 0xffffff98, v6
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; SDAG-NEXT: v_cndmask_b32_e32 v7, 0, v0, vcc
+; SDAG-NEXT: ; implicit-def: $vgpr5
+; SDAG-NEXT: ; implicit-def: $vgpr0_vgpr1
+; SDAG-NEXT: ; implicit-def: $vgpr6
+; SDAG-NEXT: ; implicit-def: $vgpr2_vgpr3
+; SDAG-NEXT: ; %bb.3: ; %Flow3
+; SDAG-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB5_13
+; SDAG-NEXT: ; %bb.4: ; %NodeBlock
+; SDAG-NEXT: v_cmp_lt_i32_e32 vcc, 25, v5
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; SDAG-NEXT: s_cbranch_execz .LBB5_8
+; SDAG-NEXT: ; %bb.5: ; %LeafBlock
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 26, v5
+; SDAG-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; SDAG-NEXT: s_cbranch_execz .LBB5_7
+; SDAG-NEXT: ; %bb.6: ; %itofp-sw-default
+; SDAG-NEXT: v_sub_u32_e32 v11, 0x66, v6
+; SDAG-NEXT: v_sub_u32_e32 v9, 64, v11
+; SDAG-NEXT: v_lshrrev_b64 v[7:8], v11, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[9:10], v9, v[2:3]
+; SDAG-NEXT: v_sub_u32_e32 v12, 38, v6
+; SDAG-NEXT: v_or_b32_e32 v10, v8, v10
+; SDAG-NEXT: v_or_b32_e32 v9, v7, v9
+; SDAG-NEXT: v_lshrrev_b64 v[7:8], v12, v[2:3]
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v11
+; SDAG-NEXT: v_add_u32_e32 v13, 26, v6
+; SDAG-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v11
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v9, vcc
+; SDAG-NEXT: v_lshrrev_b64 v[9:10], v12, v[0:1]
+; SDAG-NEXT: v_lshlrev_b64 v[11:12], v13, v[2:3]
+; SDAG-NEXT: v_subrev_u32_e32 v6, 38, v6
+; SDAG-NEXT: v_cndmask_b32_e64 v14, v7, v0, s[4:5]
+; SDAG-NEXT: v_lshlrev_b64 v[6:7], v6, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v8, v8, v1, s[4:5]
+; SDAG-NEXT: v_or_b32_e32 v10, v12, v10
+; SDAG-NEXT: v_or_b32_e32 v9, v11, v9
+; SDAG-NEXT: v_cmp_gt_u32_e32 vcc, 64, v13
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], v13, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e32 v7, v7, v10, vcc
+; SDAG-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v13
+; SDAG-NEXT: v_cndmask_b32_e32 v6, v6, v9, vcc
+; SDAG-NEXT: v_cndmask_b32_e64 v3, v7, v3, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e64 v2, v6, v2, s[4:5]
+; SDAG-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc
+; SDAG-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc
+; SDAG-NEXT: v_or_b32_e32 v1, v1, v3
+; SDAG-NEXT: v_or_b32_e32 v0, v0, v2
+; SDAG-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; SDAG-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; SDAG-NEXT: v_or_b32_e32 v7, v14, v0
+; SDAG-NEXT: v_mov_b32_e32 v0, v7
+; SDAG-NEXT: v_mov_b32_e32 v1, v8
+; SDAG-NEXT: .LBB5_7: ; %Flow1
+; SDAG-NEXT: s_or_b64 exec, exec, s[12:13]
+; SDAG-NEXT: .LBB5_8: ; %Flow2
+; SDAG-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; SDAG-NEXT: ; %bb.9: ; %itofp-sw-bb
+; SDAG-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; SDAG-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: v_lshrrev_b32_e32 v2, 2, v0
+; SDAG-NEXT: v_and_or_b32 v0, v2, 1, v0
+; SDAG-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; SDAG-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; SDAG-NEXT: v_and_b32_e32 v2, 0x4000000, v0
+; SDAG-NEXT: v_cmp_ne_u32_e32 vcc, 0, v2
+; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 2
+; SDAG-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; SDAG-NEXT: ; %bb.11: ; %itofp-if-then20
+; SDAG-NEXT: v_alignbit_b32 v7, v1, v0, 3
+; SDAG-NEXT: v_mov_b32_e32 v4, v5
+; SDAG-NEXT: ; %bb.12: ; %Flow
+; SDAG-NEXT: s_or_b64 exec, exec, s[4:5]
+; SDAG-NEXT: .LBB5_13: ; %Flow4
+; SDAG-NEXT: s_or_b64 exec, exec, s[8:9]
+; SDAG-NEXT: v_and_b32_e32 v0, 0x7fffff, v7
+; SDAG-NEXT: v_lshl_or_b32 v0, v4, 23, v0
+; SDAG-NEXT: v_add_u32_e32 v0, 1.0, v0
+; SDAG-NEXT: v_cvt_f16_f32_e32 v4, v0
+; SDAG-NEXT: .LBB5_14: ; %Flow5
+; SDAG-NEXT: s_or_b64 exec, exec, s[6:7]
+; SDAG-NEXT: v_mov_b32_e32 v0, v4
+; SDAG-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-LABEL: uitofp_i128_to_f16:
+; GISEL: ; %bb.0: ; %itofp-entry
+; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-NEXT: v_or_b32_e32 v4, v0, v2
+; GISEL-NEXT: v_or_b32_e32 v5, v1, v3
+; GISEL-NEXT: s_mov_b32 s4, 0
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[4:5]
+; GISEL-NEXT: v_mov_b32_e32 v4, s4
+; GISEL-NEXT: s_and_saveexec_b64 s[6:7], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB5_14
+; GISEL-NEXT: ; %bb.1: ; %itofp-if-end
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v0
+; GISEL-NEXT: v_ffbh_u32_e32 v4, v1
+; GISEL-NEXT: v_add_u32_e32 v5, 32, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v6, v2
+; GISEL-NEXT: v_min_u32_e32 v4, v4, v5
+; GISEL-NEXT: v_ffbh_u32_e32 v5, v3
+; GISEL-NEXT: v_add_u32_e32 v6, 32, v6
+; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: v_add_u32_e32 v4, 64, v4
+; GISEL-NEXT: v_min_u32_e32 v5, v5, v6
+; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v4, vcc
+; GISEL-NEXT: v_sub_u32_e32 v7, 0x80, v5
+; GISEL-NEXT: v_sub_u32_e32 v6, 0x7f, v5
+; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 24, v7
+; GISEL-NEXT: ; implicit-def: $vgpr4
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5]
+; GISEL-NEXT: ; %bb.2: ; %itofp-if-else
+; GISEL-NEXT: v_add_u32_e32 v2, 0xffffff98, v5
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2
+; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc
+; GISEL-NEXT: ; implicit-def: $vgpr7
+; GISEL-NEXT: ; implicit-def: $vgpr0
+; GISEL-NEXT: ; implicit-def: $vgpr5
+; GISEL-NEXT: ; implicit-def: $vgpr2
+; GISEL-NEXT: ; %bb.3: ; %Flow3
+; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB5_13
+; GISEL-NEXT: ; %bb.4: ; %NodeBlock
+; GISEL-NEXT: v_cmp_le_i32_e32 vcc, 26, v7
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: s_xor_b64 s[10:11], exec, s[4:5]
+; GISEL-NEXT: s_cbranch_execz .LBB5_8
+; GISEL-NEXT: ; %bb.5: ; %LeafBlock
+; GISEL-NEXT: v_cmp_ne_u32_e32 vcc, 26, v7
+; GISEL-NEXT: s_and_saveexec_b64 s[12:13], vcc
+; GISEL-NEXT: s_cbranch_execz .LBB5_7
+; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default
+; GISEL-NEXT: v_sub_u32_e32 v4, 0x66, v5
+; GISEL-NEXT: v_sub_u32_e32 v10, 64, v4
+; GISEL-NEXT: v_lshrrev_b64 v[8:9], v4, v[0:1]
+; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3]
+; GISEL-NEXT: v_subrev_u32_e32 v12, 64, v4
+; GISEL-NEXT: v_or_b32_e32 v10, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v11, v9, v11
+; GISEL-NEXT: v_lshrrev_b64 v[8:9], v12, v[2:3]
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4
+; GISEL-NEXT: v_add_u32_e32 v5, 26, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e32 vcc, 0, v4
+; GISEL-NEXT: v_sub_u32_e32 v10, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v12, v8, v0, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v4, v9, v1, vcc
+; GISEL-NEXT: v_lshrrev_b64 v[8:9], v5, -1
+; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, -1
+; GISEL-NEXT: v_subrev_u32_e32 v13, 64, v5
+; GISEL-NEXT: v_or_b32_e32 v14, v8, v10
+; GISEL-NEXT: v_or_b32_e32 v15, v9, v11
+; GISEL-NEXT: v_lshrrev_b64 v[10:11], v13, -1
+; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v10, v10, v14, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v15, vcc
+; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v5
+; GISEL-NEXT: v_cndmask_b32_e32 v8, 0, v8, vcc
+; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc
+; GISEL-NEXT: v_cndmask_b32_e64 v5, v10, -1, s[4:5]
+; GISEL-NEXT: v_cndmask_b32_e64 v10, v11, -1, s[4:5]
+; GISEL-NEXT: v_and_b32_e32 v2, v8, v2
+; GISEL-NEXT: v_and_b32_e32 v3, v9, v3
+; GISEL-NEXT: v_and_or_b32 v0, v5, v0, v2
+; GISEL-NEXT: v_and_or_b32 v1, v10, v1, v3
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1]
+; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc
+; GISEL-NEXT: v_or_b32_e32 v3, v12, v0
+; GISEL-NEXT: v_mov_b32_e32 v0, v3
+; GISEL-NEXT: v_mov_b32_e32 v1, v4
+; GISEL-NEXT: v_mov_b32_e32 v2, v5
+; GISEL-NEXT: v_mov_b32_e32 v3, v6
+; GISEL-NEXT: .LBB5_7: ; %Flow1
+; GISEL-NEXT: s_or_b64 exec, exec, s[12:13]
+; GISEL-NEXT: .LBB5_8: ; %Flow2
+; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11]
+; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb
+; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[0:1]
+; GISEL-NEXT: ; %bb.10: ; %itofp-sw-epilog
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: v_bfe_u32 v2, v0, 2, 1
+; GISEL-NEXT: v_or_b32_e32 v0, v0, v2
+; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0
+; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc
+; GISEL-NEXT: v_and_b32_e32 v2, 0x4000000, v0
+; GISEL-NEXT: v_mov_b32_e32 v3, 0
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1]
+; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3]
+; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc
+; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20
+; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1]
+; GISEL-NEXT: v_mov_b32_e32 v6, v7
+; GISEL-NEXT: ; %bb.12: ; %Flow
+; GISEL-NEXT: s_or_b64 exec, exec, s[4:5]
+; GISEL-NEXT: .LBB5_13: ; %Flow4
+; GISEL-NEXT: s_or_b64 exec, exec, s[8:9]
+; GISEL-NEXT: v_lshl_add_u32 v0, v6, 23, 1.0
+; GISEL-NEXT: v_mov_b32_e32 v1, 0x7fffff
+; GISEL-NEXT: v_and_or_b32 v0, v4, v1, v0
+; GISEL-NEXT: v_cvt_f16_f32_e32 v4, v0
+; GISEL-NEXT: .LBB5_14: ; %Flow5
+; GISEL-NEXT: s_or_b64 exec, exec, s[6:7]
+; GISEL-NEXT: v_mov_b32_e32 v0, v4
+; GISEL-NEXT: s_setpc_b64 s[30:31]
+ %cvt = uitofp i128 %x to half
+ ret half %cvt
+}
+
+; FIXME: ExpandLargeFpConvert asserts on bfloat
+; define bfloat @sitofp_i128_to_bf16(i128 %x) {
+; %cvt = sitofp i128 %x to bfloat
+; ret bfloat %cvt
+; }
+
+; define bfloat @uitofp_i128_to_bf16(i128 %x) {
+; %cvt = uitofp i128 %x to bfloat
+; ret bfloat %cvt
+; }
+;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; GCN: {{.*}}
diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
index 1acbb09..fbf2ee1 100644
--- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
+++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll
@@ -60,7 +60,6 @@ define amdgpu_kernel void @test_kernel(i32 %val) #0 {
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1
diff --git a/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll
new file mode 100644
index 0000000..d101d8d
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lds-mixed-absolute-addresses-unused.ll
@@ -0,0 +1,26 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S -mtriple=amdgcn-- -amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+; RUN: opt -S -mtriple=amdgcn-- -passes=amdgpu-lower-module-lds < %s 2>&1 | FileCheck %s
+
+; This looks like a partially lowered module, but the non-lowered GV isn't used by any kernels.
+; In such cases, LowerModuleLDS is free to leave it in and ignore it, and we want to make sure
+; LowerModuleLDS doesn't crash if it re-runs on such modules.
+@notLowered = addrspace(3) global i32 poison
+@lowered = addrspace(3) global i32 poison, !absolute_symbol !0
+
+@llvm.compiler.used = appending addrspace(1) global [1 x ptr] [ptr addrspacecast (ptr addrspace(3) @notLowered to ptr)], section "llvm.metadata"
+
+define amdgpu_kernel void @kern(i32 %val0) {
+; CHECK-LABEL: define amdgpu_kernel void @kern(
+; CHECK-SAME: i32 [[VAL0:%.*]]) {
+; CHECK-NEXT: [[VAL1:%.*]] = add i32 [[VAL0]], 4
+; CHECK-NEXT: store i32 [[VAL1]], ptr addrspace(3) @lowered, align 4
+; CHECK-NEXT: ret void
+;
+ %val1 = add i32 %val0, 4
+ store i32 %val1, ptr addrspace(3) @lowered
+ ret void
+}
+
+
+!0 = !{i32 0, i32 1}
diff --git a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
index b512a43..b1f4f2e 100644
--- a/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
+++ b/llvm/test/CodeGen/AMDGPU/lds-reject-mixed-absolute-addresses.ll
@@ -8,7 +8,7 @@
define amdgpu_kernel void @kern() {
%val0 = load i32, ptr addrspace(3) @var1
%val1 = add i32 %val0, 4
- store i32 %val1, ptr addrspace(3) @var1
+ store i32 %val1, ptr addrspace(3) @var2
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
index 5007f77..0ff5dd3 100644
--- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
+++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll
@@ -195,13 +195,13 @@
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU atomic optimizations
; GCN-O1-NEXT: Expand Atomic instructions
-; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Dominator Tree Construction
+; GCN-O1-NEXT: Natural Loop Information
+; GCN-O1-NEXT: AMDGPU Promote Alloca
; GCN-O1-NEXT: Cycle Info Analysis
; GCN-O1-NEXT: Uniformity Analysis
; GCN-O1-NEXT: AMDGPU IR optimizations
; GCN-O1-NEXT: Basic Alias Analysis (stateless AA impl)
-; GCN-O1-NEXT: Natural Loop Information
; GCN-O1-NEXT: Canonicalize natural loops
; GCN-O1-NEXT: Scalar Evolution Analysis
; GCN-O1-NEXT: Loop Pass Manager
@@ -470,9 +470,9 @@
; GCN-O1-OPTS-NEXT: Uniformity Analysis
; GCN-O1-OPTS-NEXT: AMDGPU atomic optimizations
; GCN-O1-OPTS-NEXT: Expand Atomic instructions
-; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Dominator Tree Construction
; GCN-O1-OPTS-NEXT: Natural Loop Information
+; GCN-O1-OPTS-NEXT: AMDGPU Promote Alloca
; GCN-O1-OPTS-NEXT: Canonicalize natural loops
; GCN-O1-OPTS-NEXT: Lazy Branch Probability Analysis
; GCN-O1-OPTS-NEXT: Lazy Block Frequency Analysis
@@ -775,9 +775,9 @@
; GCN-O2-NEXT: Uniformity Analysis
; GCN-O2-NEXT: AMDGPU atomic optimizations
; GCN-O2-NEXT: Expand Atomic instructions
-; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Dominator Tree Construction
; GCN-O2-NEXT: Natural Loop Information
+; GCN-O2-NEXT: AMDGPU Promote Alloca
; GCN-O2-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O2-NEXT: Scalar Evolution Analysis
; GCN-O2-NEXT: Straight line strength reduction
@@ -1084,9 +1084,9 @@
; GCN-O3-NEXT: Uniformity Analysis
; GCN-O3-NEXT: AMDGPU atomic optimizations
; GCN-O3-NEXT: Expand Atomic instructions
-; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Dominator Tree Construction
; GCN-O3-NEXT: Natural Loop Information
+; GCN-O3-NEXT: AMDGPU Promote Alloca
; GCN-O3-NEXT: Split GEPs to a variadic base and a constant offset for better CSE
; GCN-O3-NEXT: Scalar Evolution Analysis
; GCN-O3-NEXT: Straight line strength reduction
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
index b4415c1..f6197e0 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w32.ll
@@ -1,132 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W32 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W32 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=+wavefrontsize32,-wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-declare <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1))
-declare <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1))
-declare <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1))
-declare <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1))
+declare <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1))
+declare <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W32-LABEL: global_load_tr_b64:
-; GFX12-SDAG-W32: ; %bb.0: ; %entry
-; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
-; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-W32-NEXT: s_nop 0
-; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W32-NEXT: s_endpgm
-;
-; GFX12-GISEL-W32-LABEL: global_load_tr_b64:
-; GFX12-GISEL-W32: ; %bb.0: ; %entry
-; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
-; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-W32-NEXT: s_nop 0
-; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W32-NEXT: s_endpgm
+; GFX12-LABEL: global_load_tr_b64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_tr_b64 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <2 x i32> @llvm.amdgcn.global.load.tr.v2i32.p1(ptr addrspace(1) %gep)
+ %val = call <2 x i32> @llvm.amdgcn.global.load.tr.b64.v2i32.p1(ptr addrspace(1) %gep)
store <2 x i32> %val, ptr addrspace(1) %use
ret void
}
-define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W32-LABEL: global_load_tr_b128_i16:
-; GFX12-SDAG-W32: ; %bb.0: ; %entry
-; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
-; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-SDAG-W32-NEXT: s_nop 0
-; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W32-NEXT: s_endpgm
-;
-; GFX12-GISEL-W32-LABEL: global_load_tr_b128_i16:
-; GFX12-GISEL-W32: ; %bb.0: ; %entry
-; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
-; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-GISEL-W32-NEXT: s_nop 0
-; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W32-NEXT: s_endpgm
+define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v4, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b128 v4, v[0:3], s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x i16> @llvm.amdgcn.global.load.tr.v8i16.p1(ptr addrspace(1) %gep)
+ %val = call <8 x i16> @llvm.amdgcn.global.load.tr.b128.v8i16.p1(ptr addrspace(1) %gep)
store <8 x i16> %val, ptr addrspace(1) %use
ret void
}
-
-define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W32-LABEL: global_load_tr_b128_half:
-; GFX12-SDAG-W32: ; %bb.0: ; %entry
-; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
-; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-SDAG-W32-NEXT: s_nop 0
-; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W32-NEXT: s_endpgm
-;
-; GFX12-GISEL-W32-LABEL: global_load_tr_b128_half:
-; GFX12-GISEL-W32: ; %bb.0: ; %entry
-; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
-; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-GISEL-W32-NEXT: s_nop 0
-; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W32-NEXT: s_endpgm
-entry:
- %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x half> @llvm.amdgcn.global.load.tr.v8f16.p1(ptr addrspace(1) %gep)
- store <8 x half> %val, ptr addrspace(1) %use
- ret void
-}
-
-define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W32-LABEL: global_load_tr_b128_bfloat:
-; GFX12-SDAG-W32: ; %bb.0: ; %entry
-; GFX12-SDAG-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-SDAG-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
-; GFX12-SDAG-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-SDAG-W32-NEXT: s_nop 0
-; GFX12-SDAG-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W32-NEXT: s_endpgm
-;
-; GFX12-GISEL-W32-LABEL: global_load_tr_b128_bfloat:
-; GFX12-GISEL-W32: ; %bb.0: ; %entry
-; GFX12-GISEL-W32-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W32-NEXT: v_mov_b32_e32 v4, 0
-; GFX12-GISEL-W32-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_load_tr_b128 v[0:3], v4, s[0:1] offset:32
-; GFX12-GISEL-W32-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W32-NEXT: global_store_b128 v4, v[0:3], s[2:3]
-; GFX12-GISEL-W32-NEXT: s_nop 0
-; GFX12-GISEL-W32-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W32-NEXT: s_endpgm
-entry:
- %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <8 x bfloat> @llvm.amdgcn.global.load.tr.v8bf16.p1(ptr addrspace(1) %gep)
- store <8 x bfloat> %val, ptr addrspace(1) %use
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
index 7ad1416..a2dc366 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.global.load.tr-w64.ll
@@ -1,132 +1,44 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-SDAG-W64 %s
-; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefixes=GFX12-GISEL-W64 %s
+; RUN: llc -global-isel=0 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
+; RUN: llc -global-isel=1 -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -mattr=-wavefrontsize32,+wavefrontsize64 < %s | FileCheck -check-prefix=GFX12 %s
-declare i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1))
-declare <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1))
-declare <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1))
-declare <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1))
+declare i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1))
+declare <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1))
define amdgpu_kernel void @global_load_tr_b64(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W64-LABEL: global_load_tr_b64:
-; GFX12-SDAG-W64: ; %bb.0: ; %entry
-; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32
-; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX12-SDAG-W64-NEXT: s_nop 0
-; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W64-NEXT: s_endpgm
-;
-; GFX12-GISEL-W64-LABEL: global_load_tr_b64:
-; GFX12-GISEL-W64: ; %bb.0: ; %entry
-; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v0, 0
-; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32
-; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_store_b32 v0, v1, s[2:3]
-; GFX12-GISEL-W64-NEXT: s_nop 0
-; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W64-NEXT: s_endpgm
+; GFX12-LABEL: global_load_tr_b64:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v0, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_tr_b64 v1, v0, s[0:1] offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b32 v0, v1, s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call i32 @llvm.amdgcn.global.load.tr.i32.p1(ptr addrspace(1) %gep)
+ %val = call i32 @llvm.amdgcn.global.load.tr.b64.i32.p1(ptr addrspace(1) %gep)
store i32 %val, ptr addrspace(1) %use
ret void
}
-define amdgpu_kernel void @global_load_tr_b128_i16(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W64-LABEL: global_load_tr_b128_i16:
-; GFX12-SDAG-W64: ; %bb.0: ; %entry
-; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
-; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-W64-NEXT: s_nop 0
-; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W64-NEXT: s_endpgm
-;
-; GFX12-GISEL-W64-LABEL: global_load_tr_b128_i16:
-; GFX12-GISEL-W64: ; %bb.0: ; %entry
-; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
-; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-W64-NEXT: s_nop 0
-; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W64-NEXT: s_endpgm
+define amdgpu_kernel void @global_load_tr_b128(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
+; GFX12-LABEL: global_load_tr_b128:
+; GFX12: ; %bb.0: ; %entry
+; GFX12-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
+; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_wait_kmcnt 0x0
+; GFX12-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
+; GFX12-NEXT: s_wait_loadcnt 0x0
+; GFX12-NEXT: global_store_b64 v2, v[0:1], s[2:3]
+; GFX12-NEXT: s_nop 0
+; GFX12-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
+; GFX12-NEXT: s_endpgm
entry:
%gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x i16> @llvm.amdgcn.global.load.tr.v4i16.p1(ptr addrspace(1) %gep)
+ %val = call <4 x i16> @llvm.amdgcn.global.load.tr.b128.v4i16.p1(ptr addrspace(1) %gep)
store <4 x i16> %val, ptr addrspace(1) %use
ret void
}
-
-define amdgpu_kernel void @global_load_tr_b128_half(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W64-LABEL: global_load_tr_b128_half:
-; GFX12-SDAG-W64: ; %bb.0: ; %entry
-; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
-; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-W64-NEXT: s_nop 0
-; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W64-NEXT: s_endpgm
-;
-; GFX12-GISEL-W64-LABEL: global_load_tr_b128_half:
-; GFX12-GISEL-W64: ; %bb.0: ; %entry
-; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
-; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-W64-NEXT: s_nop 0
-; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W64-NEXT: s_endpgm
-entry:
- %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x half> @llvm.amdgcn.global.load.tr.v4f16.p1(ptr addrspace(1) %gep)
- store <4 x half> %val, ptr addrspace(1) %use
- ret void
-}
-
-define amdgpu_kernel void @global_load_tr_b128_bfloat(ptr addrspace(1) %addr, ptr addrspace(1) %use) {
-; GFX12-SDAG-W64-LABEL: global_load_tr_b128_bfloat:
-; GFX12-SDAG-W64: ; %bb.0: ; %entry
-; GFX12-SDAG-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-SDAG-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-SDAG-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
-; GFX12-SDAG-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-SDAG-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-SDAG-W64-NEXT: s_nop 0
-; GFX12-SDAG-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-SDAG-W64-NEXT: s_endpgm
-;
-; GFX12-GISEL-W64-LABEL: global_load_tr_b128_bfloat:
-; GFX12-GISEL-W64: ; %bb.0: ; %entry
-; GFX12-GISEL-W64-NEXT: s_load_b128 s[0:3], s[0:1], 0x24
-; GFX12-GISEL-W64-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-GISEL-W64-NEXT: s_wait_kmcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_load_tr_b128 v[0:1], v2, s[0:1] offset:32
-; GFX12-GISEL-W64-NEXT: s_wait_loadcnt 0x0
-; GFX12-GISEL-W64-NEXT: global_store_b64 v2, v[0:1], s[2:3]
-; GFX12-GISEL-W64-NEXT: s_nop 0
-; GFX12-GISEL-W64-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS)
-; GFX12-GISEL-W64-NEXT: s_endpgm
-entry:
- %gep = getelementptr i64, ptr addrspace(1) %addr, i32 4
- %val = call <4 x bfloat> @llvm.amdgcn.global.load.tr.v4bf16.p1(ptr addrspace(1) %gep)
- store <4 x bfloat> %val, ptr addrspace(1) %use
- ret void
-}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
index 091b29c..e93595b 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.iglp.opt.single.2b.mir
@@ -4,6 +4,8 @@
--- |
define amdgpu_kernel void @single-wave-phase-2b(ptr addrspace(3) noalias %in0, ptr addrspace(3) noalias %in1, ptr addrspace(3) noalias %in2, ptr addrspace(3) noalias %in3, ptr addrspace(3) noalias %in4, ptr addrspace(3) noalias %in5, ptr addrspace(3) noalias %in6, ptr addrspace(3) noalias %in7, ptr addrspace(3) noalias %in8, ptr addrspace(3) noalias %in9, ptr addrspace(3) noalias %in10, ptr addrspace(3) noalias %in11, ptr addrspace(7) noalias %in12, ptr addrspace(7) noalias %in13, ptr addrspace(7) noalias %in14, ptr addrspace(7) noalias %in15, ptr addrspace(7) noalias %in16, ptr addrspace(7) noalias %in17, ptr addrspace(7) noalias %in18, ptr addrspace(7) noalias %in19, ptr addrspace(7) noalias %in20, ptr addrspace(7) noalias %in21, ptr addrspace(7) noalias %in22, ptr addrspace(7) noalias %in23, ptr addrspace(7) noalias %in24, ptr addrspace(7) noalias %in25, ptr addrspace(7) noalias %in26, ptr addrspace(7) noalias %in27, ptr addrspace(7) noalias %in28, ptr addrspace(7) noalias %in29) #0 { ret void }
+ attributes #0 = { nounwind "amdgpu-waves-per-eu"="1,1" "amdgpu-flat-work-group-size"="1,256" }
+
!0 = distinct !{!0}
!1 = !{!1, !0}
...
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
index 1348315..7b1f55e 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.msaa.load.ll
@@ -22,18 +22,36 @@ main_body:
define amdgpu_ps <4 x float> @load_2dmsaa_both(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
; GFX11-LABEL: load_2dmsaa_both:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_msaa_load v[0:4], v[0:2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x00,0x00,0x60,0x00]
-; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x05]
+; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf]
+; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; encoding: [0x08,0x03,0x12,0x7e]
+; GFX11-NEXT: v_mov_b32_e32 v10, v8 ; encoding: [0x08,0x03,0x14,0x7e]
+; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; encoding: [0x08,0x03,0x16,0x7e]
+; GFX11-NEXT: v_mov_b32_e32 v12, v8 ; encoding: [0x08,0x03,0x18,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf]
+; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02]
+; GFX11-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
+; GFX11-NEXT: image_msaa_load v[0:4], v[5:7], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x98,0x02,0x60,0xf0,0x05,0x00,0x60,0x00]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00]
+; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x08,0x04,0x08,0x00]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: load_2dmsaa_both:
; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
+; GFX12-NEXT: v_dual_mov_b32 v7, v0 :: v_dual_mov_b32 v8, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x08,0x07]
+; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x05]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf]
+; GFX12-NEXT: v_dual_mov_b32 v9, v8 :: v_dual_mov_b32 v10, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0a,0x09]
+; GFX12-NEXT: v_dual_mov_b32 v11, v8 :: v_dual_mov_b32 v12, v8 ; encoding: [0x08,0x01,0x10,0xca,0x08,0x01,0x0c,0x0b]
+; GFX12-NEXT: v_dual_mov_b32 v0, v8 :: v_dual_mov_b32 v1, v9 ; encoding: [0x08,0x01,0x10,0xca,0x09,0x01,0x00,0x00]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf]
+; GFX12-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v3, v11 ; encoding: [0x0a,0x01,0x10,0xca,0x0b,0x01,0x02,0x02]
+; GFX12-NEXT: v_mov_b32_e32 v4, v12 ; encoding: [0x0c,0x03,0x08,0x7e]
+; GFX12-NEXT: image_msaa_load v[0:4], [v7, v6, v5], s[0:7] dmask:0x2 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; encoding: [0x0e,0x20,0x86,0xe4,0x00,0x01,0x00,0x00,0x07,0x06,0x05,0x00]
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
-; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00]
+; GFX12-NEXT: global_store_b32 v8, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x08,0x00,0x00,0x00]
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f32i32.i32(i32 2, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 3, i32 0)
@@ -63,18 +81,37 @@ main_body:
define amdgpu_ps <4 x float> @load_2darraymsaa_tfe(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
; GFX11-LABEL: load_2darraymsaa_tfe:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_msaa_load v[0:4], v[0:3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x00,0x00,0x20,0x00]
-; GFX11-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 ; encoding: [0x80,0x00,0x10,0xca,0x03,0x01,0x08,0x09]
+; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x06,0x07]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; encoding: [0x42,0x02,0x87,0xbf]
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v10, v9 ; encoding: [0x00,0x01,0x10,0xca,0x09,0x01,0x0a,0x05]
+; GFX11-NEXT: v_mov_b32_e32 v11, v9 ; encoding: [0x09,0x03,0x16,0x7e]
+; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; encoding: [0x09,0x03,0x18,0x7e]
+; GFX11-NEXT: v_mov_b32_e32 v13, v9 ; encoding: [0x09,0x03,0x1a,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x93,0x01,0x87,0xbf]
+; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02]
+; GFX11-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e]
+; GFX11-NEXT: image_msaa_load v[0:4], v[5:8], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x9c,0x08,0x60,0xf0,0x05,0x00,0x20,0x00]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x05,0x04,0x08,0x00]
+; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x09,0x04,0x08,0x00]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: load_2darraymsaa_tfe:
; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_msaa_load v[0:4], [v0, v1, v2, v3], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
-; GFX12-NEXT: v_mov_b32_e32 v5, 0 ; encoding: [0x80,0x02,0x0a,0x7e]
+; GFX12-NEXT: v_mov_b32_e32 v9, 0 ; encoding: [0x80,0x02,0x12,0x7e]
+; GFX12-NEXT: v_dual_mov_b32 v5, v3 :: v_dual_mov_b32 v6, v2 ; encoding: [0x03,0x01,0x10,0xca,0x02,0x01,0x06,0x05]
+; GFX12-NEXT: v_dual_mov_b32 v7, v1 :: v_dual_mov_b32 v8, v0 ; encoding: [0x01,0x01,0x10,0xca,0x00,0x01,0x08,0x07]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x23,0x01,0x87,0xbf]
+; GFX12-NEXT: v_dual_mov_b32 v10, v9 :: v_dual_mov_b32 v11, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0a,0x0a]
+; GFX12-NEXT: v_dual_mov_b32 v12, v9 :: v_dual_mov_b32 v13, v9 ; encoding: [0x09,0x01,0x10,0xca,0x09,0x01,0x0c,0x0c]
+; GFX12-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 ; encoding: [0x09,0x01,0x10,0xca,0x0a,0x01,0x00,0x00]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; encoding: [0x92,0x01,0x87,0xbf]
+; GFX12-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 ; encoding: [0x0b,0x01,0x10,0xca,0x0c,0x01,0x02,0x02]
+; GFX12-NEXT: v_mov_b32_e32 v4, v13 ; encoding: [0x0d,0x03,0x08,0x7e]
+; GFX12-NEXT: image_msaa_load v[0:4], [v8, v7, v6, v5], s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; encoding: [0x0f,0x20,0x06,0xe6,0x00,0x00,0x00,0x00,0x08,0x07,0x06,0x05]
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
-; GFX12-NEXT: global_store_b32 v5, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x05,0x00,0x00,0x00]
+; GFX12-NEXT: global_store_b32 v9, v4, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x02,0x09,0x00,0x00,0x00]
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x float>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f32i32.i32(i32 8, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
@@ -155,18 +192,31 @@ main_body:
define amdgpu_ps <4 x half> @load_2dmsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %fragid) {
; GFX11-LABEL: load_2dmsaa_tfe_d16:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_msaa_load v[0:2], v[0:2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x00,0x00,0x20,0x00]
-; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x03]
+; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x05]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf]
+; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; encoding: [0x06,0x03,0x0e,0x7e]
+; GFX11-NEXT: v_mov_b32_e32 v8, v6 ; encoding: [0x06,0x03,0x10,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
+; GFX11-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX11-NEXT: image_msaa_load v[0:2], v[3:5], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x98,0x01,0x62,0xf0,0x03,0x00,0x20,0x00]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00]
+; GFX11-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x06,0x02,0x08,0x00]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: load_2dmsaa_tfe_d16:
; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x00]
-; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
+; GFX12-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x05]
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x03]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf]
+; GFX12-NEXT: v_dual_mov_b32 v7, v6 :: v_dual_mov_b32 v8, v6 ; encoding: [0x06,0x01,0x10,0xca,0x06,0x01,0x08,0x07]
+; GFX12-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7 ; encoding: [0x06,0x01,0x10,0xca,0x07,0x01,0x00,0x00]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
+; GFX12-NEXT: v_mov_b32_e32 v2, v8 ; encoding: [0x08,0x03,0x04,0x7e]
+; GFX12-NEXT: image_msaa_load v[0:2], [v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA unorm tfe d16 ; encoding: [0x2e,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x05,0x04,0x03,0x00]
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
-; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00]
+; GFX12-NEXT: global_store_b32 v6, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x06,0x00,0x00,0x00]
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2dmsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
@@ -196,18 +246,31 @@ main_body:
define amdgpu_ps <4 x half> @load_2darraymsaa_tfe_d16(<8 x i32> inreg %rsrc, ptr addrspace(1) inreg %out, i32 %s, i32 %t, i32 %slice, i32 %fragid) {
; GFX11-LABEL: load_2darraymsaa_tfe_d16:
; GFX11: ; %bb.0: ; %main_body
-; GFX11-NEXT: image_msaa_load v[0:2], v[0:3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9c,0x01,0x62,0xf0,0x00,0x00,0x20,0x00]
-; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06]
+; GFX11-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; encoding: [0x22,0x01,0x87,0xbf]
+; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; encoding: [0x07,0x03,0x10,0x7e]
+; GFX11-NEXT: v_mov_b32_e32 v9, v7 ; encoding: [0x07,0x03,0x12,0x7e]
+; GFX11-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00]
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
+; GFX11-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX11-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x9d,0x01,0x62,0xf0,0x06,0x00,0x20,0x00,0x05,0x04,0x03,0x00]
; GFX11-NEXT: s_waitcnt vmcnt(0) ; encoding: [0xf7,0x03,0x89,0xbf]
-; GFX11-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x03,0x02,0x08,0x00]
+; GFX11-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x00,0x00,0x6a,0xdc,0x07,0x02,0x08,0x00]
; GFX11-NEXT: ; return to shader part epilog
;
; GFX12-LABEL: load_2darraymsaa_tfe_d16:
; GFX12: ; %bb.0: ; %main_body
-; GFX12-NEXT: image_msaa_load v[0:2], [v0, v1, v2, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x00,0x01,0x02,0x03]
-; GFX12-NEXT: v_mov_b32_e32 v3, 0 ; encoding: [0x80,0x02,0x06,0x7e]
+; GFX12-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_mov_b32 v7, 0 ; encoding: [0x00,0x01,0x10,0xca,0x80,0x00,0x06,0x06]
+; GFX12-NEXT: v_dual_mov_b32 v4, v2 :: v_dual_mov_b32 v5, v1 ; encoding: [0x02,0x01,0x10,0xca,0x01,0x01,0x04,0x04]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; encoding: [0x92,0x00,0x87,0xbf]
+; GFX12-NEXT: v_dual_mov_b32 v8, v7 :: v_dual_mov_b32 v9, v7 ; encoding: [0x07,0x01,0x10,0xca,0x07,0x01,0x08,0x08]
+; GFX12-NEXT: v_dual_mov_b32 v0, v7 :: v_dual_mov_b32 v1, v8 ; encoding: [0x07,0x01,0x10,0xca,0x08,0x01,0x00,0x00]
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_2) ; encoding: [0x02,0x00,0x87,0xbf]
+; GFX12-NEXT: v_mov_b32_e32 v2, v9 ; encoding: [0x09,0x03,0x04,0x7e]
+; GFX12-NEXT: image_msaa_load v[0:2], [v6, v5, v4, v3], s[0:7] dmask:0x1 dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe d16 ; encoding: [0x2f,0x20,0x46,0xe4,0x00,0x00,0x00,0x00,0x06,0x05,0x04,0x03]
; GFX12-NEXT: s_wait_loadcnt 0x0 ; encoding: [0x00,0x00,0xc0,0xbf]
-; GFX12-NEXT: global_store_b32 v3, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x03,0x00,0x00,0x00]
+; GFX12-NEXT: global_store_b32 v7, v2, s[8:9] ; encoding: [0x08,0x80,0x06,0xee,0x00,0x00,0x00,0x01,0x07,0x00,0x00,0x00]
; GFX12-NEXT: ; return to shader part epilog
main_body:
%v = call {<4 x half>,i32} @llvm.amdgcn.image.msaa.load.2darraymsaa.v4f16i32.i32(i32 1, i32 %s, i32 %t, i32 %slice, i32 %fragid, <8 x i32> %rsrc, i32 1, i32 0)
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
index 429528e..e3dd036 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll
@@ -147,6 +147,34 @@ main_body:
ret half %res
}
+define amdgpu_ps half @v_interp_rtz_f16(float inreg %i, float inreg %j, i32 inreg %m0) #0 {
+; GCN-LABEL: v_interp_rtz_f16:
+; GCN: ; %bb.0: ; %main_body
+; GCN-NEXT: s_mov_b32 s3, exec_lo
+; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo
+; GCN-NEXT: s_mov_b32 m0, s2
+; GCN-NEXT: lds_param_load v1, attr0.x wait_vdst:15
+; GCN-NEXT: s_mov_b32 exec_lo, s3
+; GCN-NEXT: v_mov_b32_e32 v0, s0
+; GCN-NEXT: v_mov_b32_e32 v2, s1
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GCN-NEXT: v_interp_p10_rtz_f16_f32 v3, v1, v0, v1 wait_exp:0
+; GCN-NEXT: v_interp_p10_rtz_f16_f32 v0, v1, v0, v1 op_sel:[1,0,1,0] wait_exp:7
+; GCN-NEXT: v_interp_p2_rtz_f16_f32 v3, v1, v2, v3 wait_exp:7
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GCN-NEXT: v_interp_p2_rtz_f16_f32 v0, v1, v2, v0 op_sel:[1,0,0,0] wait_exp:7
+; GCN-NEXT: v_add_f16_e32 v0, v3, v0
+; GCN-NEXT: ; return to shader part epilog
+main_body:
+ %p0 = call float @llvm.amdgcn.lds.param.load(i32 0, i32 0, i32 %m0)
+ %l_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 0)
+ %l_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %l_p0, i1 0)
+ %h_p0 = call float @llvm.amdgcn.interp.p10.rtz.f16(float %p0, float %i, float %p0, i1 1)
+ %h_p1 = call half @llvm.amdgcn.interp.p2.rtz.f16(float %p0, float %j, float %h_p0, i1 1)
+ %res = fadd half %l_p1, %h_p1
+ ret half %res
+}
+
define amdgpu_ps half @v_interp_f16_imm_params(float inreg %i, float inreg %j) #0 {
; GCN-LABEL: v_interp_f16_imm_params:
; GCN: ; %bb.0: ; %main_body
@@ -172,6 +200,8 @@ declare float @llvm.amdgcn.interp.inreg.p10(float, float, float) #0
declare float @llvm.amdgcn.interp.inreg.p2(float, float, float) #0
declare float @llvm.amdgcn.interp.inreg.p10.f16(float, float, float, i1) #0
declare half @llvm.amdgcn.interp.inreg.p2.f16(float, float, float, i1) #0
+declare float @llvm.amdgcn.interp.p10.rtz.f16(float, float, float, i1) #0
+declare half @llvm.amdgcn.interp.p2.rtz.f16(float, float, float, i1) #0
declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #0
declare void @llvm.amdgcn.exp.f16(i32, i32, float, float, float, float, i1, i1) #0
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
new file mode 100644
index 0000000..fdcb177
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sched.group.barrier.gfx12.ll
@@ -0,0 +1,333 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 < %s | FileCheck -check-prefix=GCN %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -misched-cluster=0 -amdgpu-igrouplp-exact-solver-max-branches=250000 < %s | FileCheck -check-prefix=EXACTCUTOFF %s
+
+declare <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16..i16(<8 x half>, <16 x half>, <8 x half>, i16)
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_cluster(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_lshlrev_b32_e32 v28, 4, v0
+; GCN-NEXT: v_mov_b32_e32 v48, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; GCN-NEXT: v_add_nc_u32_e32 v0, s0, v28
+; GCN-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; GCN-NEXT: ds_load_b128 v[8:11], v0
+; GCN-NEXT: ds_load_b128 v[12:15], v0 offset:512
+; GCN-NEXT: ds_load_b128 v[16:19], v0 offset:1536
+; GCN-NEXT: ds_load_b128 v[20:23], v0 offset:3072
+; GCN-NEXT: ds_load_b128 v[24:27], v0 offset:5120
+; GCN-NEXT: ds_load_b128 v[4:7], v0 offset:11280
+; GCN-NEXT: ds_load_b128 v[0:3], v0 offset:11264
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
+; GCN-NEXT: s_wait_dscnt 0x6
+; GCN-NEXT: v_mov_b32_e32 v31, v11
+; GCN-NEXT: s_wait_dscnt 0x5
+; GCN-NEXT: v_mov_b32_e32 v35, v15
+; GCN-NEXT: s_wait_dscnt 0x4
+; GCN-NEXT: v_mov_b32_e32 v39, v19
+; GCN-NEXT: s_wait_dscnt 0x3
+; GCN-NEXT: v_mov_b32_e32 v43, v23
+; GCN-NEXT: s_wait_dscnt 0x2
+; GCN-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
+; GCN-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
+; GCN-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
+; GCN-NEXT: v_mov_b32_e32 v32, v12
+; GCN-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
+; GCN-NEXT: v_mov_b32_e32 v36, v16
+; GCN-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
+; GCN-NEXT: v_mov_b32_e32 v40, v20
+; GCN-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
+; GCN-NEXT: v_mov_b32_e32 v44, v24
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
+; GCN-NEXT: ds_store_b128 v49, v[28:31]
+; GCN-NEXT: ds_store_b128 v50, v[32:35] offset:512
+; GCN-NEXT: ds_store_b128 v50, v[36:39] offset:1024
+; GCN-NEXT: ds_store_b128 v50, v[40:43] offset:1536
+; GCN-NEXT: ds_store_b128 v50, v[44:47] offset:2048
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
+; GCN-NEXT: s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_cluster:
+; EXACTCUTOFF: ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT: v_lshlrev_b32_e32 v28, 4, v0
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v48, 0
+; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_2)
+; EXACTCUTOFF-NEXT: v_add_nc_u32_e32 v0, s0, v28
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v50, s1 :: v_dual_add_nc_u32 v49, s1, v28
+; EXACTCUTOFF-NEXT: ds_load_b128 v[8:11], v0
+; EXACTCUTOFF-NEXT: ds_load_b128 v[12:15], v0 offset:512
+; EXACTCUTOFF-NEXT: ds_load_b128 v[16:19], v0 offset:1536
+; EXACTCUTOFF-NEXT: ds_load_b128 v[20:23], v0 offset:3072
+; EXACTCUTOFF-NEXT: ds_load_b128 v[24:27], v0 offset:5120
+; EXACTCUTOFF-NEXT: ds_load_b128 v[4:7], v0 offset:11280
+; EXACTCUTOFF-NEXT: ds_load_b128 v[0:3], v0 offset:11264
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(7) SyncID(0)
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x6
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v31, v11
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x5
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v35, v15
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x4
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v39, v19
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x3
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v43, v23
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v47, v27 :: v_dual_mov_b32 v30, v10
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v29, v9 :: v_dual_mov_b32 v28, v8
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v32, v12
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v38, v18 :: v_dual_mov_b32 v37, v17
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v36, v16
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v42, v22 :: v_dual_mov_b32 v41, v21
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v40, v20
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v46, v26 :: v_dual_mov_b32 v45, v25
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v44, v24
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[28:31], v[8:11], v[0:7], v48
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[32:35], v[12:15], v[0:7], v48
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[36:39], v[16:19], v[0:7], v48
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[40:43], v[20:23], v[0:7], v48
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[44:47], v[24:27], v[0:7], v48
+; EXACTCUTOFF-NEXT: ds_store_b128 v49, v[28:31]
+; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[32:35] offset:512
+; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[36:39] offset:1024
+; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[40:43] offset:1536
+; EXACTCUTOFF-NEXT: ds_store_b128 v50, v[44:47] offset:2048
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(5) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(5) SyncID(0)
+; EXACTCUTOFF-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %in, i32 %idx
+ %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
+ %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 32
+ %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
+ %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 64
+ %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
+ %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 96
+ %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
+ %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 128
+ %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
+ %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %load.4.addr, i32 192
+ %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
+ %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0)
+ %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0)
+ %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0)
+ %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0)
+ %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0)
+ %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
+ store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
+ %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
+ store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
+ %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
+ store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
+ %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
+ store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
+ %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
+ store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
+ ; Group 1 of 3: 7 DS reads (mask 256 = 0x100, size 7, sync ID 0).
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 7, i32 0)
+ ; Group 2 of 3: 5 SWMMAC instructions (mask 8 = 0x8, size 5, sync ID 0).
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 5, i32 0)
+ ; Group 3 of 3: 5 DS writes (mask 512 = 0x200, size 5, sync ID 0).
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 5, i32 0)
+ ret void
+}
+
+define amdgpu_kernel void @test_sched_group_barrier_pipeline_SWMMAC_interleaved(ptr addrspace(3) noalias %in, ptr addrspace(3) noalias %out) #0 {
+; GCN-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
+; GCN: ; %bb.0: ; %entry
+; GCN-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; GCN-NEXT: v_mov_b32_e32 v18, 0
+; GCN-NEXT: s_wait_kmcnt 0x0
+; GCN-NEXT: v_lshl_add_u32 v17, v0, 5, s0
+; GCN-NEXT: v_lshl_add_u32 v0, v0, 4, s1
+; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:1024
+; GCN-NEXT: ds_load_b128 v[1:4], v17
+; GCN-NEXT: ds_load_b128 v[5:8], v17 offset:16
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
+; GCN-NEXT: s_wait_dscnt 0x2
+; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ds_store_b128 v0, v[13:16]
+; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:2560
+; GCN-NEXT: v_mov_b32_e32 v0, s1
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:512
+; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:4608
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1024
+; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:7168
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:1536
+; GCN-NEXT: ds_load_b128 v[9:12], v17 offset:10240
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; GCN-NEXT: s_wait_dscnt 0x0
+; GCN-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; GCN-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GCN-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; GCN-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; GCN-NEXT: ds_store_b128 v0, v[13:16] offset:2048
+; GCN-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; GCN-NEXT: s_endpgm
+;
+; EXACTCUTOFF-LABEL: test_sched_group_barrier_pipeline_SWMMAC_interleaved:
+; EXACTCUTOFF: ; %bb.0: ; %entry
+; EXACTCUTOFF-NEXT: s_load_b64 s[0:1], s[0:1], 0x24
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v18, 0
+; EXACTCUTOFF-NEXT: s_wait_kmcnt 0x0
+; EXACTCUTOFF-NEXT: v_lshl_add_u32 v17, v0, 5, s0
+; EXACTCUTOFF-NEXT: v_lshl_add_u32 v0, v0, 4, s1
+; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:1024
+; EXACTCUTOFF-NEXT: ds_load_b128 v[1:4], v17
+; EXACTCUTOFF-NEXT: ds_load_b128 v[5:8], v17 offset:16
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(3) SyncID(0)
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x2
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16]
+; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:2560
+; EXACTCUTOFF-NEXT: v_mov_b32_e32 v0, s1
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:512
+; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:4608
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1024
+; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:7168
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:1536
+; EXACTCUTOFF-NEXT: ds_load_b128 v[9:12], v17 offset:10240
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000100) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_wait_dscnt 0x0
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v16, v12 :: v_dual_mov_b32 v15, v11
+; EXACTCUTOFF-NEXT: v_dual_mov_b32 v14, v10 :: v_dual_mov_b32 v13, v9
+; EXACTCUTOFF-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; EXACTCUTOFF-NEXT: v_swmmac_f16_16x16x32_f16 v[13:16], v[9:12], v[1:8], v18
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000008) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: ds_store_b128 v0, v[13:16] offset:2048
+; EXACTCUTOFF-NEXT: ; sched_group_barrier mask(0x00000200) size(1) SyncID(0)
+; EXACTCUTOFF-NEXT: s_endpgm
+entry:
+ %idx = call i32 @llvm.amdgcn.workitem.id.x()
+ %load.b.addr = getelementptr <16 x half>, ptr addrspace(3) %in, i32 %idx
+ %load.b = load <16 x half>, ptr addrspace(3) %load.b.addr
+ %load.0.addr = getelementptr <8 x half>, ptr addrspace(3) %load.b.addr, i32 64
+ %load.0 = load <8 x half>, ptr addrspace(3) %load.0.addr
+ %load.1.addr = getelementptr <8 x half>, ptr addrspace(3) %load.0.addr, i32 96
+ %load.1 = load <8 x half>, ptr addrspace(3) %load.1.addr
+ %load.2.addr = getelementptr <8 x half>, ptr addrspace(3) %load.1.addr, i32 128
+ %load.2 = load <8 x half>, ptr addrspace(3) %load.2.addr
+ %load.3.addr = getelementptr <8 x half>, ptr addrspace(3) %load.2.addr, i32 160
+ %load.3 = load <8 x half>, ptr addrspace(3) %load.3.addr
+ %load.4.addr = getelementptr <8 x half>, ptr addrspace(3) %load.3.addr, i32 192
+ %load.4 = load <8 x half>, ptr addrspace(3) %load.4.addr
+ %mai.0 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.0, <16 x half> %load.b, <8 x half> %load.0, i1 0)
+ %mai.1 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.1, <16 x half> %load.b, <8 x half> %load.1, i1 0)
+ %mai.2 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.2, <16 x half> %load.b, <8 x half> %load.2, i1 0)
+ %mai.3 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.3, <16 x half> %load.b, <8 x half> %load.3, i1 0)
+ %mai.4 = call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x32.f16.v8f16.v8f16.v16f16.i16(<8 x half> %load.4, <16 x half> %load.b, <8 x half> %load.4, i1 0)
+ %store.0.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 %idx
+ store <8 x half> %mai.0, ptr addrspace(3) %store.0.addr
+ %store.1.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 32
+ store <8 x half> %mai.1, ptr addrspace(3) %store.1.addr
+ %store.2.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 64
+ store <8 x half> %mai.2, ptr addrspace(3) %store.2.addr
+ %store.3.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 96
+ store <8 x half> %mai.3, ptr addrspace(3) %store.3.addr
+ %store.4.addr = getelementptr <8 x half>, ptr addrspace(3) %out, i32 128
+ store <8 x half> %mai.4, ptr addrspace(3) %store.4.addr
+ ; Group 1 of 15: 3 DS reads (mask 256 = 0x100, size 3, sync ID 0).
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 3, i32 0)
+ ; Group 2 of 15: 1 SWMMAC (mask 8 = 0x8).
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; Group 3 of 15: 1 DS write (mask 512 = 0x200).
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+ ; Group 4 of 15: 1 DS read (mask 256 = 0x100).
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ ; Group 5 of 15: 1 SWMMAC (mask 8 = 0x8).
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; Group 6 of 15: 1 DS write (mask 512 = 0x200).
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+ ; Group 7 of 15: 1 DS read (mask 256 = 0x100).
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ ; Group 8 of 15: 1 SWMMAC (mask 8 = 0x8).
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; Group 9 of 15: 1 DS write (mask 512 = 0x200).
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+ ; Group 10 of 15: 1 DS read (mask 256 = 0x100).
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ ; Group 11 of 15: 1 SWMMAC (mask 8 = 0x8).
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; Group 12 of 15: 1 DS write (mask 512 = 0x200).
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+ ; Group 13 of 15: 1 DS read (mask 256 = 0x100).
+ call void @llvm.amdgcn.sched.group.barrier(i32 256, i32 1, i32 0)
+ ; Group 14 of 15: 1 SWMMAC (mask 8 = 0x8).
+ call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
+ ; Group 15 of 15: 1 DS write (mask 512 = 0x200).
+ call void @llvm.amdgcn.sched.group.barrier(i32 512, i32 1, i32 0)
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
index 00be32b..ba3d306 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.buffer.load.format.ll
@@ -2,6 +2,7 @@
;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-enable-prt-strict-null -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-SDAG %s
;RUN: llc < %s -global-isel -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs | FileCheck --check-prefixes=GFX12,GFX12-GISEL %s
@@ -34,6 +35,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(<4 x i32>
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v8, 0
+; NOPRT-NEXT: s_clause 0x2
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen
+; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc
+; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v8, 0
@@ -75,6 +86,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(<4 x i32> inreg) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_immoffs:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_immoffs:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -146,6 +164,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(<4 x i32> inreg) {
; GFX11-NEXT: v_add_f32_e32 v2, v10, v2
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_immoffs_large:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v8, 0
+; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc
+; NOPRT-NEXT: s_clause 0x1
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092
+; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092
+; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5
+; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4
+; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1
+; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; NOPRT-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
+; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_immoffs_large:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v8, 0
@@ -196,6 +233,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(<4 x i32> inreg) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_voffset_large_12bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_voffset_large_12bit:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -235,6 +279,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(<4 x i32> inreg) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_voffset_large_13bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_voffset_large_13bit:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -274,6 +327,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(<4 x i32> inreg) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_voffset_large_16bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_voffset_large_16bit:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -313,6 +375,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(<4 x i32> inreg) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_voffset_large_23bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_voffset_large_23bit:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -352,6 +423,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(<4 x i32> inreg) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_voffset_large_24bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-SDAG-LABEL: buffer_load_voffset_large_24bit:
; GFX12-SDAG: ; %bb.0: ; %main_body
; GFX12-SDAG-NEXT: v_dual_mov_b32 v1, 0x800000 :: v_dual_mov_b32 v0, 0
@@ -389,6 +469,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(<4 x i32> inreg, i32) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_idx:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_idx:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], null idxen
@@ -427,6 +513,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(<4 x i32> inreg, i32) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_ofs:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_ofs:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
@@ -466,6 +561,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(<4 x i32> inreg, i32) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_ofs_imm:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_ofs_imm:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, 0
@@ -497,6 +601,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(<4 x i32> inreg, i32, i32) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_both:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_both:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], null idxen offen
@@ -529,6 +639,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(<4 x i32> inreg, i32, i3
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_both_reversed:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v2, v0
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_both_reversed:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v2, v0
@@ -562,6 +679,13 @@ define amdgpu_ps float @buffer_load_x(<4 x i32> inreg %rsrc) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_x:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_x:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -595,6 +719,13 @@ define amdgpu_ps float @buffer_load_x_i32(<4 x i32> inreg %rsrc) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_x_i32:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_x_i32:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -629,6 +760,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(<4 x i32> inreg %rsrc) {
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_xy:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_xy:
; GFX12: ; %bb.0: ; %main_body
; GFX12-NEXT: v_mov_b32_e32 v0, 0
@@ -644,7 +782,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX6-LABEL: buffer_load_v4i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
-; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; GFX6-NEXT: v_mov_b32_e32 v7, 2
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
+; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
; GFX6-NEXT: s_mov_b32 s0, s2
@@ -658,7 +801,12 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX8PLUS-LABEL: buffer_load_v4i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
-; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; GFX8PLUS-NEXT: v_mov_b32_e32 v7, 2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2
+; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
; GFX8PLUS-NEXT: v_mov_b32_e32 v0, v6
@@ -667,22 +815,40 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
;
; GFX11-LABEL: buffer_load_v4i32_tfe:
; GFX11: ; %bb.0:
-; GFX11-NEXT: v_mov_b32_e32 v2, 0
-; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v2
+; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_v4i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v2, 2
+; NOPRT-NEXT: v_mov_b32_e32 v6, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v6
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_v4i32_tfe:
; GFX12: ; %bb.0:
-; GFX12-NEXT: v_mov_b32_e32 v2, 0
-; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
+; GFX12-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v7, 2
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
+; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v7, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX12-NEXT: v_mov_b32_e32 v0, v6
; GFX12-NEXT: ; return to shader part epilog
- %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 0, i32 0, i32 0, i32 0)
+ %load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.buffer.load.format.sl_v4i32i32s(<4 x i32> %rsrc, i32 2, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x i32>, i32 } %load, 0
store <4 x i32> %data, ptr addrspace(1) %out
%status = extractvalue { <4 x i32>, i32 } %load, 1
@@ -694,6 +860,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX6-LABEL: buffer_load_v4f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -708,6 +878,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX8PLUS-LABEL: buffer_load_v4f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2
; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -718,15 +892,32 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-LABEL: buffer_load_v4f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v2
; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_v4f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v6, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v6
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_v4f32_tfe:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v6, v2
; GFX12-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b128 v[0:1], v[2:5], off
@@ -744,6 +935,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX6-LABEL: buffer_load_v3i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -759,6 +953,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX8PLUS-LABEL: buffer_load_v3i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
@@ -769,15 +966,31 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-LABEL: buffer_load_v3i32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_v3i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
+; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v5
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_v3i32_tfe:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_mov_b32_e32 v5, v2
; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
@@ -795,6 +1008,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX6-LABEL: buffer_load_v3f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -810,6 +1026,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX8PLUS-LABEL: buffer_load_v3f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
@@ -820,15 +1039,31 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-LABEL: buffer_load_v3f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_v3f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
+; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v5
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_v3f32_tfe:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
+; GFX12-NEXT: v_mov_b32_e32 v5, v2
; GFX12-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b96 v[0:1], v[2:4], off
@@ -846,6 +1081,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX6-LABEL: buffer_load_v2i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -860,6 +1098,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX8PLUS-LABEL: buffer_load_v2i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -870,15 +1110,29 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-LABEL: buffer_load_v2i32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_v2i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v4, 0
+; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v4
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_v2i32_tfe:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -896,6 +1150,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX6-LABEL: buffer_load_v2f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -910,6 +1167,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX8PLUS-LABEL: buffer_load_v2f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -920,15 +1179,29 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(<4 x i32> inreg %rsrc, ptr addrspa
; GFX11-LABEL: buffer_load_v2f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_v2f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v4, 0
+; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v4
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_v2f32_tfe:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v4, v2
; GFX12-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b64 v[0:1], v[2:3], off
@@ -946,6 +1219,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
; GFX6-LABEL: buffer_load_i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -960,6 +1234,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
; GFX8PLUS-LABEL: buffer_load_i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2
@@ -970,15 +1245,28 @@ define amdgpu_cs float @buffer_load_i32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
; GFX11-LABEL: buffer_load_i32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v3, 0
+; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v3
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_i32_tfe:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
@@ -996,6 +1284,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
; GFX6-LABEL: buffer_load_f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -1010,6 +1299,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
; GFX8PLUS-LABEL: buffer_load_f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2
@@ -1020,15 +1310,28 @@ define amdgpu_cs float @buffer_load_f32_tfe(<4 x i32> inreg %rsrc, ptr addrspace
; GFX11-LABEL: buffer_load_f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; return to shader part epilog
;
+; NOPRT-LABEL: buffer_load_f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v3, 0
+; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v3
+; NOPRT-NEXT: ; return to shader part epilog
+;
; GFX12-LABEL: buffer_load_f32_tfe:
; GFX12: ; %bb.0:
; GFX12-NEXT: v_mov_b32_e32 v2, 0
+; GFX12-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX12-NEXT: v_mov_b32_e32 v3, v2
; GFX12-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], null idxen tfe
; GFX12-NEXT: s_wait_loadcnt 0x0
; GFX12-NEXT: global_store_b32 v[0:1], v2, off
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
index b0bd4e4..c5202b8 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.struct.ptr.buffer.load.format.ll
@@ -2,6 +2,7 @@
;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck --check-prefixes=GFX6 %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck --check-prefixes=GFX8PLUS %s
;RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=GFX11 %s
+;RUN: llc < %s -mtriple=amdgcn -mattr=-enable-prt-strict-null -mcpu=gfx1100 -verify-machineinstrs | FileCheck --check-prefixes=NOPRT %s
define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrspace(8) inreg) {
; GFX6-LABEL: buffer_load:
@@ -31,6 +32,16 @@ define amdgpu_ps {<4 x float>, <4 x float>, <4 x float>} @buffer_load(ptr addrsp
; GFX11-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v8, 0
+; NOPRT-NEXT: s_clause 0x2
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 0 idxen
+; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], 0 idxen glc
+; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], 0 idxen slc
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 0)
%data_glc = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 0, i32 0, i32 1)
@@ -62,6 +73,13 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs(ptr addrspace(8) inreg) {
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_immoffs:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:42
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 42, i32 0, i32 0)
ret <4 x float> %data
@@ -126,6 +144,25 @@ define amdgpu_ps <4 x float> @buffer_load_immoffs_large(ptr addrspace(8) inreg)
; GFX11-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
; GFX11-NEXT: v_add_f32_e32 v2, v10, v2
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_immoffs_large:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v8, 0
+; NOPRT-NEXT: s_movk_i32 s4, 0x7ffc
+; NOPRT-NEXT: s_clause 0x1
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v8, s[0:3], 60 idxen offset:4092
+; NOPRT-NEXT: buffer_load_format_xyzw v[4:7], v8, s[0:3], s4 idxen offset:4092
+; NOPRT-NEXT: s_mov_b32 s4, 0x8ffc
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: v_add_f32_e32 v1, v1, v5
+; NOPRT-NEXT: buffer_load_format_xyzw v[8:11], v8, s[0:3], s4 idxen offset:4
+; NOPRT-NEXT: v_dual_add_f32 v0, v0, v4 :: v_dual_add_f32 v3, v3, v7
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v1, v9, v1
+; NOPRT-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; NOPRT-NEXT: v_dual_add_f32 v0, v8, v0 :: v_dual_add_f32 v3, v11, v3
+; NOPRT-NEXT: v_add_f32_e32 v2, v10, v2
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%d.0 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 60, i32 0)
%d.1 = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 32764, i32 0)
@@ -156,6 +193,13 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_12bit(ptr addrspace(8) i
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_12bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 4092, i32 0, i32 0)
ret <4 x float> %data
@@ -188,6 +232,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_13bit(ptr addrspace(8) i
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_13bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0x1000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8188, i32 0, i32 0)
ret <4 x float> %data
@@ -220,6 +273,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_16bit(ptr addrspace(8) i
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_16bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0xf000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 65532, i32 0, i32 0)
ret <4 x float> %data
@@ -252,6 +314,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_23bit(ptr addrspace(8) i
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_23bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0x7ff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 8388604, i32 0, i32 0)
ret <4 x float> %data
@@ -284,6 +355,15 @@ define amdgpu_ps <4 x float> @buffer_load_voffset_large_24bit(ptr addrspace(8) i
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_voffset_large_24bit:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, 0xfff000 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:4092
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 16777212, i32 0, i32 0)
ret <4 x float> %data
@@ -307,6 +387,12 @@ define amdgpu_ps <4 x float> @buffer_load_idx(ptr addrspace(8) inreg, i32) {
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_idx:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 0, i32 0, i32 0)
ret <4 x float> %data
@@ -339,6 +425,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs(ptr addrspace(8) inreg, i32) {
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_ofs:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -371,6 +466,15 @@ define amdgpu_ps <4 x float> @buffer_load_ofs_imm(ptr addrspace(8) inreg, i32) {
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_ofs_imm:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: s_mov_b32 s4, 0
+; NOPRT-NEXT: s_delay_alu instid0(SALU_CYCLE_1)
+; NOPRT-NEXT: v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, s4
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen offset:60
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%ofs = add i32 %1, 60
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 0, i32 %ofs, i32 0, i32 0)
@@ -395,6 +499,12 @@ define amdgpu_ps <4 x float> @buffer_load_both(ptr addrspace(8) inreg, i32, i32)
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_both:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[0:1], s[0:3], 0 idxen offen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %1, i32 %2, i32 0, i32 0)
ret <4 x float> %data
@@ -421,6 +531,13 @@ define amdgpu_ps <4 x float> @buffer_load_both_reversed(ptr addrspace(8) inreg,
; GFX11-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_both_reversed:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v2, v0
+; NOPRT-NEXT: buffer_load_format_xyzw v[0:3], v[1:2], s[0:3], 0 idxen offen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <4 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v4f32(ptr addrspace(8) %0, i32 %2, i32 %1, i32 0, i32 0)
ret <4 x float> %data
@@ -447,6 +564,13 @@ define amdgpu_ps float @buffer_load_x(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_x:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call float @llvm.amdgcn.struct.ptr.buffer.load.format.f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
ret float %data
@@ -473,6 +597,13 @@ define amdgpu_ps float @buffer_load_x_i32(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_x_i32:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_x v0, v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call i32 @llvm.amdgcn.struct.ptr.buffer.load.format.i32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%fdata = bitcast i32 %data to float
@@ -500,6 +631,13 @@ define amdgpu_ps <2 x float> @buffer_load_xy(ptr addrspace(8) inreg %rsrc) {
; GFX11-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_xy:
+; NOPRT: ; %bb.0: ; %main_body
+; NOPRT-NEXT: v_mov_b32_e32 v0, 0
+; NOPRT-NEXT: buffer_load_format_xy v[0:1], v0, s[0:3], 0 idxen
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: ; return to shader part epilog
main_body:
%data = call <2 x float> @llvm.amdgcn.struct.ptr.buffer.load.format.v2f32(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
ret <2 x float> %data
@@ -509,6 +647,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX6-LABEL: buffer_load_v4i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -523,6 +665,10 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX8PLUS-LABEL: buffer_load_v4i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2
; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -533,11 +679,25 @@ define amdgpu_cs float @buffer_load_v4i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX11-LABEL: buffer_load_v4i32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v2
; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v4i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v6, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v6
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { <4 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x i32>, i32 } %load, 0
store <4 x i32> %data, ptr addrspace(1) %out
@@ -550,6 +710,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX6-LABEL: buffer_load_v4f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
+; GFX6-NEXT: v_mov_b32_e32 v6, v2
; GFX6-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -564,6 +728,10 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX8PLUS-LABEL: buffer_load_v4f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v6, v2
; GFX8PLUS-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx4 v[0:1], v[2:5]
@@ -574,11 +742,25 @@ define amdgpu_cs float @buffer_load_v4f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX11-LABEL: buffer_load_v4f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
+; GFX11-NEXT: v_mov_b32_e32 v6, v2
; GFX11-NEXT: buffer_load_format_xyzw v[2:6], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b128 v[0:1], v[2:5], off
; GFX11-NEXT: v_mov_b32_e32 v0, v6
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v4f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v6, 0
+; NOPRT-NEXT: buffer_load_format_xyzw v[2:6], v6, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b128 v[0:1], v[2:5], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v6
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { <4 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v4f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <4 x float>, i32 } %load, 0
store <4 x float> %data, ptr addrspace(1) %out
@@ -591,6 +773,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX6-LABEL: buffer_load_v3i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -606,6 +791,9 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX8PLUS-LABEL: buffer_load_v3i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
@@ -616,11 +804,24 @@ define amdgpu_cs float @buffer_load_v3i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX11-LABEL: buffer_load_v3i32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v3i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
+; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v5
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { <3 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x i32>, i32 } %load, 0
store <3 x i32> %data, ptr addrspace(1) %out
@@ -633,6 +834,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX6-LABEL: buffer_load_v3f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -648,6 +852,9 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX8PLUS-LABEL: buffer_load_v3f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v5, v2
; GFX8PLUS-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx3 v[0:1], v[2:4]
@@ -658,11 +865,24 @@ define amdgpu_cs float @buffer_load_v3f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX11-LABEL: buffer_load_v3f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
+; GFX11-NEXT: v_mov_b32_e32 v5, v2
; GFX11-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b96 v[0:1], v[2:4], off
; GFX11-NEXT: v_mov_b32_e32 v0, v5
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v3f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v5, 0
+; NOPRT-NEXT: buffer_load_format_xyz v[2:5], v5, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b96 v[0:1], v[2:4], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v5
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { <3 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v3f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <3 x float>, i32 } %load, 0
store <3 x float> %data, ptr addrspace(1) %out
@@ -675,6 +895,9 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX6-LABEL: buffer_load_v2i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -689,6 +912,8 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX8PLUS-LABEL: buffer_load_v2i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -699,11 +924,23 @@ define amdgpu_cs float @buffer_load_v2i32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX11-LABEL: buffer_load_v2i32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v2i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v4, 0
+; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v4
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { <2 x i32>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x i32>, i32 } %load, 0
store <2 x i32> %data, ptr addrspace(1) %out
@@ -716,6 +953,9 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX6-LABEL: buffer_load_v2f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
+; GFX6-NEXT: v_mov_b32_e32 v4, v2
+; GFX6-NEXT: v_mov_b32_e32 v5, v2
; GFX6-NEXT: buffer_load_format_xyz v[2:5], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -730,6 +970,8 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX8PLUS-LABEL: buffer_load_v2f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
+; GFX8PLUS-NEXT: v_mov_b32_e32 v4, v2
; GFX8PLUS-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
@@ -740,11 +982,23 @@ define amdgpu_cs float @buffer_load_v2f32_tfe(ptr addrspace(8) inreg %rsrc, ptr
; GFX11-LABEL: buffer_load_v2f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
+; GFX11-NEXT: v_mov_b32_e32 v4, v2
; GFX11-NEXT: buffer_load_format_xy v[2:4], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b64 v[0:1], v[2:3], off
; GFX11-NEXT: v_mov_b32_e32 v0, v4
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_v2f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v4, 0
+; NOPRT-NEXT: buffer_load_format_xy v[2:4], v4, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b64 v[0:1], v[2:3], off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v4
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { <2 x float>, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_v2f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { <2 x float>, i32 } %load, 0
store <2 x float> %data, ptr addrspace(1) %out
@@ -757,6 +1011,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
; GFX6-LABEL: buffer_load_i32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -771,6 +1026,7 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
; GFX8PLUS-LABEL: buffer_load_i32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2
@@ -781,11 +1037,22 @@ define amdgpu_cs float @buffer_load_i32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
; GFX11-LABEL: buffer_load_i32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_i32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v3, 0
+; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v3
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { i32, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_i32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { i32, i32 } %load, 0
store i32 %data, ptr addrspace(1) %out
@@ -798,6 +1065,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
; GFX6-LABEL: buffer_load_f32_tfe:
; GFX6: ; %bb.0:
; GFX6-NEXT: v_mov_b32_e32 v2, 0
+; GFX6-NEXT: v_mov_b32_e32 v3, v2
; GFX6-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX6-NEXT: s_mov_b32 s2, 0
; GFX6-NEXT: s_mov_b32 s3, 0xf000
@@ -812,6 +1080,7 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
; GFX8PLUS-LABEL: buffer_load_f32_tfe:
; GFX8PLUS: ; %bb.0:
; GFX8PLUS-NEXT: v_mov_b32_e32 v2, 0
+; GFX8PLUS-NEXT: v_mov_b32_e32 v3, v2
; GFX8PLUS-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX8PLUS-NEXT: s_waitcnt vmcnt(0)
; GFX8PLUS-NEXT: flat_store_dword v[0:1], v2
@@ -822,11 +1091,22 @@ define amdgpu_cs float @buffer_load_f32_tfe(ptr addrspace(8) inreg %rsrc, ptr ad
; GFX11-LABEL: buffer_load_f32_tfe:
; GFX11: ; %bb.0:
; GFX11-NEXT: v_mov_b32_e32 v2, 0
+; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT: v_mov_b32_e32 v3, v2
; GFX11-NEXT: buffer_load_format_x v[2:3], v2, s[0:3], 0 idxen tfe
; GFX11-NEXT: s_waitcnt vmcnt(0)
; GFX11-NEXT: global_store_b32 v[0:1], v2, off
; GFX11-NEXT: v_mov_b32_e32 v0, v3
; GFX11-NEXT: ; return to shader part epilog
+;
+; NOPRT-LABEL: buffer_load_f32_tfe:
+; NOPRT: ; %bb.0:
+; NOPRT-NEXT: v_mov_b32_e32 v3, 0
+; NOPRT-NEXT: buffer_load_format_x v[2:3], v3, s[0:3], 0 idxen tfe
+; NOPRT-NEXT: s_waitcnt vmcnt(0)
+; NOPRT-NEXT: global_store_b32 v[0:1], v2, off
+; NOPRT-NEXT: v_mov_b32_e32 v0, v3
+; NOPRT-NEXT: ; return to shader part epilog
%load = call { float, i32 } @llvm.amdgcn.struct.ptr.buffer.load.format.sl_f32i32s(ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0, i32 0)
%data = extractvalue { float, i32 } %load, 0
store float %data, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
index ab7ab4d..d056a97 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.maxnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @maxnum_f16(
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -170,7 +168,6 @@ define amdgpu_kernel void @maxnum_f16_imm_a(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -279,7 +276,6 @@ define amdgpu_kernel void @maxnum_f16_imm_b(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -384,21 +380,17 @@ define amdgpu_kernel void @maxnum_v2f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s2, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT: s_lshr_b32 s0, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_lshr_b32 s3, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: v_max_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, v2, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -497,20 +489,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_a(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -589,20 +579,18 @@ define amdgpu_kernel void @maxnum_v2f16_imm_b(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_max_f32_e32 v0, 4.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_max_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -688,27 +676,21 @@ define amdgpu_kernel void @maxnum_v3f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
-; SI-NEXT: s_lshr_b32 s3, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: s_lshr_b32 s8, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s8
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_max_f32_e32 v2, v3, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, v1, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_max_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT: v_max_f32_e32 v1, v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_max_f32_e32 v2, v3, v4
+; SI-NEXT: v_max_f32_e32 v0, v0, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -837,25 +819,17 @@ define amdgpu_kernel void @maxnum_v4f16(
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: s_lshr_b32 s6, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: s_lshr_b32 s6, s5, 16
+; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_lshr_b32 s4, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
; SI-NEXT: v_max_f32_e32 v3, v3, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_max_f32_e32 v1, v1, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, v2, v5
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_max_f32_e32 v2, v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_max_f32_e32 v1, v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_max_f32_e32 v0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -986,20 +960,16 @@ define amdgpu_kernel void @fmax_v4f16_imm_a(
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_max_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_max_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_max_f32_e32 v3, 2.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_max_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
index b7370ce..f934a2d 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.minnum.f16.ll
@@ -32,8 +32,6 @@ define amdgpu_kernel void @minnum_f16_ieee(
; SI-NEXT: s_mov_b32 s1, s5
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[0:3], 0
@@ -197,7 +195,6 @@ define amdgpu_kernel void @minnum_f16_imm_a(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -305,7 +302,6 @@ define amdgpu_kernel void @minnum_f16_imm_b(
; SI-NEXT: s_mov_b32 s5, s1
; SI-NEXT: s_waitcnt vmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, v0
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0
@@ -409,21 +405,17 @@ define amdgpu_kernel void @minnum_v2f16_ieee(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_lshr_b32 s1, s2, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s0
-; SI-NEXT: s_lshr_b32 s0, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v3, s1
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v2, v3, v2
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
+; SI-NEXT: s_lshr_b32 s3, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s1
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s2
+; SI-NEXT: v_cvt_f32_f16_e32 v3, s0
; SI-NEXT: v_min_f32_e32 v0, v0, v1
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v2
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, v2, v3
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0
; SI-NEXT: s_endpgm
;
@@ -556,20 +548,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_a(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -647,20 +637,18 @@ define amdgpu_kernel void @minnum_v2f16_imm_b(
; SI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x9
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: s_load_dword s2, s[2:3], 0x0
-; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_waitcnt lgkmcnt(0)
-; SI-NEXT: v_cvt_f32_f16_e32 v0, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
+; SI-NEXT: s_mov_b32 s3, 0xf000
; SI-NEXT: s_mov_b32 s2, -1
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_min_f32_e32 v0, 4.0, v0
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
-; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v0, 0x40400000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
-; SI-NEXT: v_or_b32_e32 v0, v0, v1
+; SI-NEXT: v_min_f32_e32 v1, 4.0, v1
+; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0
+; SI-NEXT: v_or_b32_e32 v0, v1, v0
; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0
; SI-NEXT: s_endpgm
;
@@ -745,27 +733,21 @@ define amdgpu_kernel void @minnum_v3f16(
; SI-NEXT: s_mov_b32 s6, -1
; SI-NEXT: s_waitcnt lgkmcnt(0)
; SI-NEXT: v_cvt_f32_f16_e32 v0, s3
-; SI-NEXT: v_cvt_f32_f16_e32 v1, s2
-; SI-NEXT: s_lshr_b32 s2, s2, 16
-; SI-NEXT: s_lshr_b32 s3, s0, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s3
+; SI-NEXT: s_lshr_b32 s3, s2, 16
+; SI-NEXT: s_lshr_b32 s8, s0, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v1, s3
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s8
; SI-NEXT: v_cvt_f32_f16_e32 v3, s2
-; SI-NEXT: v_cvt_f32_f16_e32 v5, s0
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
-; SI-NEXT: v_min_f32_e32 v2, v3, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, v1, v3
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
-; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
-; SI-NEXT: v_min_f32_e32 v0, v0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s0
+; SI-NEXT: v_cvt_f32_f16_e32 v5, s1
+; SI-NEXT: v_min_f32_e32 v1, v1, v2
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
+; SI-NEXT: v_min_f32_e32 v2, v3, v4
+; SI-NEXT: v_min_f32_e32 v0, v0, v5
+; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
-; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
-; SI-NEXT: v_or_b32_e32 v1, v1, v2
+; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1
+; SI-NEXT: v_or_b32_e32 v1, v2, v1
; SI-NEXT: buffer_store_short v0, off, s[4:7], 0 offset:4
; SI-NEXT: buffer_store_dword v1, off, s[4:7], 0
; SI-NEXT: s_endpgm
@@ -893,25 +875,17 @@ define amdgpu_kernel void @minnum_v4f16(
; SI-NEXT: v_cvt_f32_f16_e32 v2, s6
; SI-NEXT: s_lshr_b32 s6, s7, 16
; SI-NEXT: v_cvt_f32_f16_e32 v3, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
; SI-NEXT: s_lshr_b32 s6, s5, 16
+; SI-NEXT: s_lshr_b32 s4, s4, 16
; SI-NEXT: v_cvt_f32_f16_e32 v5, s6
+; SI-NEXT: v_cvt_f32_f16_e32 v7, s4
; SI-NEXT: v_cvt_f32_f16_e32 v1, s7
-; SI-NEXT: v_cvt_f32_f16_e32 v4, s4
-; SI-NEXT: s_lshr_b32 s4, s4, 16
-; SI-NEXT: v_cvt_f32_f16_e32 v7, s5
-; SI-NEXT: v_cvt_f32_f16_e32 v6, s4
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v5
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
+; SI-NEXT: v_cvt_f32_f16_e32 v6, s5
; SI-NEXT: v_min_f32_e32 v3, v3, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v7
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_min_f32_e32 v1, v1, v5
-; SI-NEXT: v_mul_f32_e32 v5, 1.0, v6
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_min_f32_e32 v2, v2, v5
-; SI-NEXT: v_mul_f32_e32 v4, 1.0, v4
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
+; SI-NEXT: v_min_f32_e32 v2, v2, v7
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
+; SI-NEXT: v_min_f32_e32 v1, v1, v6
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v0, v0, v4
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
@@ -1041,20 +1015,16 @@ define amdgpu_kernel void @fmin_v4f16_imm_a(
; SI-NEXT: v_cvt_f32_f16_e32 v1, s5
; SI-NEXT: s_lshr_b32 s5, s5, 16
; SI-NEXT: v_cvt_f32_f16_e32 v0, s4
-; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: s_lshr_b32 s4, s4, 16
+; SI-NEXT: v_cvt_f32_f16_e32 v2, s5
; SI-NEXT: v_cvt_f32_f16_e32 v3, s4
-; SI-NEXT: v_mul_f32_e32 v1, 1.0, v1
-; SI-NEXT: v_mul_f32_e32 v2, 1.0, v2
-; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
-; SI-NEXT: v_mul_f32_e32 v3, 1.0, v3
; SI-NEXT: v_min_f32_e32 v1, 0x40400000, v1
+; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
+; SI-NEXT: v_min_f32_e32 v2, 4.0, v2
; SI-NEXT: v_cvt_f16_f32_e32 v2, v2
; SI-NEXT: v_min_f32_e32 v3, 2.0, v3
-; SI-NEXT: v_mul_f32_e32 v0, 1.0, v0
; SI-NEXT: v_cvt_f16_f32_e32 v1, v1
; SI-NEXT: v_cvt_f16_f32_e32 v3, v3
-; SI-NEXT: v_min_f32_e32 v0, 0x41000000, v0
; SI-NEXT: v_cvt_f16_f32_e32 v0, v0
; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2
; SI-NEXT: v_or_b32_e32 v1, v1, v2
diff --git a/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
new file mode 100644
index 0000000..f1d9463
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/lto-lower-module-lds.ll
@@ -0,0 +1,47 @@
+
+; Default O0
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O0
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O0 -cg-opt-level 0 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O1
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O1
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O1 -cg-opt-level 1 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O2
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O2
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O2 -cg-opt-level 2 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Default O3
+; RUN: opt -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; Unified O3
+; RUN: opt -unified-lto -thinlto-split-lto-unit -thinlto-bc -mtriple=amdgcn-- -mcpu=gfx1030 %s -o %t.bc
+; RUN: llvm-lto2 run -unified-lto=full -O3 -cg-opt-level 3 %t.bc -o %t.s -r %t.bc,test,px -debug-pass-manager -debug-pass=Structure 2>&1 | FileCheck %s
+
+; First print will be from the New PM during the full LTO pipeline.
+; Second print will be from the legacy PM during the CG pipeline.
+
+; CHECK: Running pass: AMDGPULowerModuleLDSPass on [module]
+; CHECK: ModulePass Manager
+; CHECK: Lower uses of LDS variables from non-kernel functions
+
+@lds = internal unnamed_addr addrspace(3) global i32 poison, align 4
+
+define amdgpu_kernel void @test() {
+entry:
+ store i32 1, ptr addrspace(3) @lds
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
index fb3e79b..5b7f0e7 100644
--- a/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
+++ b/llvm/test/CodeGen/AMDGPU/mad-mix-lo.ll
@@ -951,56 +951,70 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
; SDAG-GFX1100-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX1100: ; %bb.0:
; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_pack_b32_f16 v0, v1, 0
; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX1100-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v6, 0
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v2, v0, 0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v2, 1.0 op_sel_hi:[1,0]
; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX900-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX900: ; %bb.0:
; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX900-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX900-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v1, 0
+; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-GFX906-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-GFX906: ; %bb.0:
; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v1, v1, v3, v5 op_sel_hi:[1,1,1]
-; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v3, v0, v2, v4 op_sel_hi:[1,1,1] clamp
; SDAG-GFX906-NEXT: v_pack_b32_f16 v1, v1, 0
-; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v3, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, v1 clamp
-; SDAG-GFX906-NEXT: v_mov_b32_e32 v0, v3
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v1, 0
+; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v3f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SDAG-VI-NEXT: v_mac_f32_e32 v8, v6, v7
; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v8 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp
; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v1, v5 clamp
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v8
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v4
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v5
+; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v1
+; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v2
+; SDAG-VI-NEXT: v_mov_b32_e32 v2, 0x3c00
+; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v2 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v3
+; SDAG-VI-NEXT: v_min_f16_e32 v1, 1.0, v1
; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
;
@@ -1139,63 +1153,80 @@ define <3 x half> @v_mad_mix_v3f32_clamp_postcvt(<3 x half> %src0, <3 x half> %s
}
define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %src1, <4 x half> %src2) #0 {
-; GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX1100: ; %bb.0:
-; GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
-; GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
-; GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
-; GFX1100-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX1100: ; %bb.0:
+; SDAG-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX1100-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX1100-NEXT: s_setpc_b64 s[30:31]
;
-; GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX900: ; %bb.0:
-; GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX900-NEXT: v_mov_b32_e32 v0, v6
-; GFX900-NEXT: v_mov_b32_e32 v1, v2
-; GFX900-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX900: ; %bb.0:
+; SDAG-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX900-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX900-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX900-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX900-NEXT: s_setpc_b64 s[30:31]
;
-; GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
-; GFX906: ; %bb.0:
-; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
-; GFX906-NEXT: v_mov_b32_e32 v0, v6
-; GFX906-NEXT: v_mov_b32_e32 v1, v2
-; GFX906-NEXT: s_setpc_b64 s[30:31]
+; SDAG-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; SDAG-GFX906: ; %bb.0:
+; SDAG-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1]
+; SDAG-GFX906-NEXT: v_pk_max_f16 v1, v7, 0
+; SDAG-GFX906-NEXT: v_pk_max_f16 v0, v6, 0
+; SDAG-GFX906-NEXT: v_pk_min_f16 v0, v0, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: v_pk_min_f16 v1, v1, 1.0 op_sel_hi:[1,0]
+; SDAG-GFX906-NEXT: s_setpc_b64 s[30:31]
;
; SDAG-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; SDAG-VI: ; %bb.0:
; SDAG-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v6, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v7, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v1, v1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v0, v0
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v3, v3
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
-; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v2, v2
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
+; SDAG-VI-NEXT: v_cvt_f32_f16_sdwa v11, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v4, v4
+; SDAG-VI-NEXT: v_cvt_f32_f16_e32 v5, v5
; SDAG-VI-NEXT: v_mac_f32_e32 v10, v7, v9
; SDAG-VI-NEXT: v_mac_f32_e32 v11, v6, v8
-; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
; SDAG-VI-NEXT: v_mac_f32_e32 v4, v0, v2
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v0, v11 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_sdwa v1, v10 clamp dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v2, v4 clamp
-; SDAG-VI-NEXT: v_cvt_f16_f32_e64 v3, v5 clamp
+; SDAG-VI-NEXT: v_mac_f32_e32 v5, v1, v3
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v0, v10
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v1, v11
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v2, v4
+; SDAG-VI-NEXT: v_cvt_f16_f32_e32 v3, v5
+; SDAG-VI-NEXT: v_max_f16_e32 v0, 0, v0
+; SDAG-VI-NEXT: v_max_f16_e32 v1, 0, v1
+; SDAG-VI-NEXT: v_max_f16_e32 v2, 0, v2
+; SDAG-VI-NEXT: v_max_f16_e32 v3, 0, v3
+; SDAG-VI-NEXT: v_mov_b32_e32 v4, 0x3c00
+; SDAG-VI-NEXT: v_min_f16_sdwa v1, v1, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; SDAG-VI-NEXT: v_min_f16_e32 v3, 1.0, v3
+; SDAG-VI-NEXT: v_min_f16_e32 v2, 1.0, v2
; SDAG-VI-NEXT: v_or_b32_e32 v0, v2, v0
; SDAG-VI-NEXT: v_or_b32_e32 v1, v3, v1
; SDAG-VI-NEXT: s_setpc_b64 s[30:31]
@@ -1241,6 +1272,40 @@ define <4 x half> @v_mad_mix_v4f32_clamp_postcvt(<4 x half> %src0, <4 x half> %s
; SDAG-CI-NEXT: v_cvt_f32_f16_e64 v3, v3 clamp
; SDAG-CI-NEXT: s_setpc_b64 s[30:31]
;
+; GISEL-GFX1100-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX1100: ; %bb.0:
+; GISEL-GFX1100-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixlo_f16 v7, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: v_fma_mixhi_f16 v7, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX1100-NEXT: s_delay_alu instid0(VALU_DEP_1)
+; GISEL-GFX1100-NEXT: v_dual_mov_b32 v0, v6 :: v_dual_mov_b32 v1, v7
+; GISEL-GFX1100-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX900-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX900: ; %bb.0:
+; GISEL-GFX900-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mad_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX900-NEXT: v_mov_b32_e32 v1, v2
+; GISEL-GFX900-NEXT: s_setpc_b64 s[30:31]
+;
+; GISEL-GFX906-LABEL: v_mad_mix_v4f32_clamp_postcvt:
+; GISEL-GFX906: ; %bb.0:
+; GISEL-GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v6, v0, v2, v4 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v6, v0, v2, v4 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixlo_f16 v2, v1, v3, v5 op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_fma_mixhi_f16 v2, v1, v3, v5 op_sel:[1,1,1] op_sel_hi:[1,1,1] clamp
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v0, v6
+; GISEL-GFX906-NEXT: v_mov_b32_e32 v1, v2
+; GISEL-GFX906-NEXT: s_setpc_b64 s[30:31]
+;
; GISEL-VI-LABEL: v_mad_mix_v4f32_clamp_postcvt:
; GISEL-VI: ; %bb.0:
; GISEL-VI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir b/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir
new file mode 100644
index 0000000..d7f5d1a
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-buffer-gfx12.mir
@@ -0,0 +1,1154 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1200 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GFX12 %s
+
+---
+name: buffer_load_dword_dwordx3
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx3
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx3_dword
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx3_dword
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1_sub2
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx2
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dword
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dword
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0_sub1
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+
+name: buffer_load_dword_dword
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_32
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_32
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub0_sub1_sub2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_OFFSET]].sub3
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub0_sub1
+ ; GFX12-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_OFFSET]].sub2
+ ; GFX12-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+ ; GFX12-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %10:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %11:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 24, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %12:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 28, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %13:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %14:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 40, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %15:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 44, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+#
+# buffer_store_dword
+#
+
+name: buffer_store_dword_xyz
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-LABEL: name: buffer_store_dword_xyz
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact %14:vreg_96, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+...
+---
+
+name: buffer_store_dwordx3_dword
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-LABEL: name: buffer_store_dwordx3_dword
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
+ BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact %14:vreg_96, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+# Two DWORDX2 stores to the same resource at immediate offsets 4 and 12.
+# The GFX12 checks expect them to be combined into one DWORDX4 store at offset 4.
+name: buffer_store_dwordx2_dwordx2
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-LABEL: name: buffer_store_dwordx2_dwordx2
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+ %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
+ BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %15:vreg_64, %13:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+...
+---
+
+# A DWORD store at offset 4 followed by a DWORDX2 store at offset 8 to the same
+# resource. The GFX12 checks expect one DWORDX3 store at offset 4 whose data is
+# the DWORD value in sub0 and the 64-bit value in sub1_sub2.
+# The DWORDX2 store uses %14 (the REG_SEQUENCE defined just above it) so that
+# its data operand is a defined register and the check can capture it by name.
+name: buffer_store_dword_dwordx2
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-LABEL: name: buffer_store_dword_dwordx2
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+...
+---
+
+# A DWORDX2 store at offset 4 followed by a DWORD store at offset 12 to the same
+# resource. The GFX12 checks expect one DWORDX3 store at offset 4 whose data is
+# the 64-bit value in sub0_sub1 and the DWORD value in sub2.
+# The block declares its live-ins once, like the other store tests in this file.
+name: buffer_store_dwordx2_dword
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-LABEL: name: buffer_store_dwordx2_dword
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+ BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact %14:vreg_64, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+# Two DWORD stores to the same resource at immediate offsets 4 and 8.
+# The GFX12 checks expect them to be combined into one DWORDX2 store at offset 4.
+name: buffer_store_dword_dword
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-LABEL: name: buffer_store_dword_dword
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %6:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+# Nine DWORD stores to the same resource at offsets 4, 8, 16, 20, 24, 28, 36, 40
+# and 44 (nothing is stored at offsets 12 and 32). The GFX12 checks expect three
+# stores: DWORDX2 at offset 4, DWORDX4 at offset 16 and DWORDX3 at offset 36.
+name: buffer_store_dword_32
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX12-LABEL: name: buffer_store_dword_32
+ ; GFX12: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GFX12-NEXT: {{ $}}
+ ; GFX12-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX2_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
+ ; GFX12-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX4_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ; GFX12-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GFX12-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+ ; GFX12-NEXT: BUFFER_STORE_DWORDX3_VBUFFER_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ %12:vgpr_32 = COPY $vgpr8
+ %11:vgpr_32 = COPY $vgpr7
+ %10:vgpr_32 = COPY $vgpr6
+ %9:vgpr_32 = COPY $vgpr5
+ %8:vgpr_32 = COPY $vgpr4
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %13:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %5:vgpr_32, %13:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %6:vgpr_32, %13:sgpr_128, $sgpr_null, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %7:vgpr_32, %13:sgpr_128, $sgpr_null, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %8:vgpr_32, %13:sgpr_128, $sgpr_null, 24, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %9:vgpr_32, %13:sgpr_128, $sgpr_null, 28, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %10:vgpr_32, %13:sgpr_128, $sgpr_null, 36, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %11:vgpr_32, %13:sgpr_128, $sgpr_null, 40, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %12:vgpr_32, %13:sgpr_128, $sgpr_null, 44, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+# Two DWORD loads at offsets 4 and 8 where the first load has its last immediate
+# operand set to 1 (presumably the swizzle bit, going by the test name).
+# The GFX12 checks expect both loads to be left unchanged.
+name: buffer_load_dword_not_merged_swizzled_0
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_not_merged_swizzled_0
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# Two DWORD loads at offsets 4 and 8 where the second load has its last immediate
+# operand set to 1 (presumably the swizzle bit, going by the test name).
+# The GFX12 checks expect both loads to be left unchanged.
+name: buffer_load_dword_not_merged_swizzled_1
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_not_merged_swizzled_1
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# Three DWORD loads in program order at offsets 4, 12 and 8; only the offset-12
+# load has its last immediate operand set to 1 (presumably the swizzle bit).
+# The GFX12 checks expect the offset-4 and offset-8 loads to become one DWORDX2
+# load at offset 4, followed by the untouched offset-12 load.
+name: buffer_load_dword_merge_across_swizzle
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_merge_across_swizzle
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %5:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %4:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# DWORD loads at offsets 4 and 8 with a DWORD store at offset 6 between them; the
+# store has its last immediate operand set to 1 (presumably the swizzle bit).
+# The GFX12 checks expect all three instructions to be left unchanged.
+name: buffer_load_dword_not_merge_across_swizzled_store
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_not_merge_across_swizzled_store
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %5:sgpr_128, $sgpr_null, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# DWORD loads at offsets 4 and 8 with a DWORD store at offset 12 between them; the
+# store has its last immediate operand set to 1 (presumably the swizzle bit).
+# The GFX12 checks expect the two loads to become one DWORDX2 load at offset 4,
+# placed ahead of the unchanged store.
+name: buffer_load_dword_merge_across_swizzled_store
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_merge_across_swizzled_store
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_OFFSET]].sub1
+ ; GFX12-NEXT: BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %6:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_VBUFFER_OFFSET_exact %4:vgpr_32, %5:sgpr_128, $sgpr_null, 12, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_OFFSET %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# Two IDXEN DWORD loads sharing the same vgpr operand and resource, at offsets
+# 0 and 4. The GFX12 checks expect one DWORDX2 IDXEN load at offset 0 whose sub0
+# and sub1 are copied out.
+name: buffer_load_dword_dword_idxen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_idxen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# An IDXEN DWORD load at offset 0 followed by an IDXEN DWORDX2 load at offset 4.
+# The GFX12 checks expect one DWORDX3 IDXEN load at offset 0, with sub0 and
+# sub1_sub2 copied out for the two original results.
+name: buffer_load_dword_dwordx2_idxen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx2_idxen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# Two IDXEN DWORDX2 loads at offsets 4 and 12. The GFX12 checks expect one
+# DWORDX4 IDXEN load at offset 4, with sub0_sub1 and sub2_sub3 copied out for
+# the two original results.
+name: buffer_load_dwordx2_dwordx2_idxen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# An IDXEN DWORD load at offset 4 followed by an IDXEN DWORDX3 load at offset 8.
+# The GFX12 checks expect one DWORDX4 IDXEN load at offset 4, with sub0 and
+# sub1_sub2_sub3 copied out for the two original results.
+name: buffer_load_dword_dwordx3_idxen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx3_idxen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# Two BOTHEN DWORD loads sharing the same vreg_64 operand and resource, at
+# offsets 0 and 4. The GFX12 checks expect one DWORDX2 BOTHEN load at offset 0
+# whose sub0 and sub1 are copied out.
+name: buffer_load_dword_dword_bothen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_bothen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# A BOTHEN DWORD load at offset 0 followed by a BOTHEN DWORDX2 load at offset 4.
+# The GFX12 checks expect one DWORDX3 BOTHEN load at offset 0, with sub0 and
+# sub1_sub2 copied out for the two original results.
+name: buffer_load_dword_dwordx2_bothen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx2_bothen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+# Two BOTHEN DWORDX2 loads at offsets 0 and 8. The GFX12 checks expect one
+# DWORDX4 BOTHEN load at offset 0, with sub0_sub1 and sub2_sub3 copied out for
+# the two original results.
+name: buffer_load_dwordx2_dwordx2_bothen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx3_bothen
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx3_bothen
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_idxen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_idxen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2_idxen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx2_idxen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_idxen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx3_idxen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx3_idxen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_IDXEN_exact]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_idxen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_dword_idxen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_IDXEN_exact]].sub2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_IDXEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_bothen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_bothen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2_bothen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx2_bothen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_bothen_exact
+body: |
+ bb.0.entry:
+
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx3_bothen_exact
+body: |
+ bb.0.entry:
+
+ ; GFX12-LABEL: name: buffer_load_dword_dwordx3_bothen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_VBUFFER_BOTHEN_exact]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_bothen_exact
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_dword_bothen_exact
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub0_sub1
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_VBUFFER_BOTHEN_exact]].sub2
+ ; GFX12-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+ ; GFX12-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub0
+ ; GFX12-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_VBUFFER_BOTHEN_exact %4, %5:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:vreg_64 = COPY $vgpr1
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %4, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %6:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:sgpr_32 = COPY $sgpr4
+ %5:vreg_64 = COPY $vgpr0
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_BOTHEN_exact %5, %7:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:vgpr_32 = COPY $vgpr1
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %4, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %6:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc
+body: |
+ bb.0.entry:
+ ; GFX12-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc
+ ; GFX12: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX12-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX12-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX12-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX12-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GFX12-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GFX12-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX12-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX12-NEXT: [[BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:sgpr_32 = COPY $sgpr4
+ %5:vgpr_32 = COPY $vgpr0
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %6:sgpr_128, $sgpr_null, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_VBUFFER_IDXEN_exact %5, %7:sgpr_128, $sgpr_null, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/merge-buffer.mir b/llvm/test/CodeGen/AMDGPU/merge-buffer.mir
new file mode 100644
index 0000000..1c6d429
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/merge-buffer.mir
@@ -0,0 +1,1130 @@
+# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 2
+# RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass si-load-store-opt -o - %s | FileCheck -check-prefixes=GCN %s
+
+---
+name: buffer_load_dword_dwordx3
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx3
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx3_dword
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx3_dword
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1_sub2
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx2
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dword
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dword
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0_sub1
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+
+name: buffer_load_dword_dword
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_32
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_32
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_OFFSET:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_OFFSET [[REG_SEQUENCE]], 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY [[BUFFER_LOAD_DWORDX4_OFFSET]].sub0_sub1_sub2
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX4_OFFSET]].sub3
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:vreg_64 = COPY [[COPY6]].sub0_sub1
+ ; GCN-NEXT: [[COPY9:%[0-9]+]]:vgpr_32 = COPY killed [[COPY6]].sub2
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:vgpr_32 = COPY [[COPY8]].sub0
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:vgpr_32 = COPY killed [[COPY8]].sub1
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_OFFSET:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_OFFSET [[REG_SEQUENCE]], 0, 36, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_OFFSET]].sub0_sub1
+ ; GCN-NEXT: [[COPY13:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_OFFSET]].sub2
+ ; GCN-NEXT: [[COPY14:%[0-9]+]]:vgpr_32 = COPY [[COPY12]].sub0
+ ; GCN-NEXT: [[COPY15:%[0-9]+]]:vgpr_32 = COPY killed [[COPY12]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %10:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 20, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %11:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 24, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %12:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 28, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %13:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 36, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %14:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 40, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %15:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 44, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+#
+# buffer_store_dword
+#
+
+name: buffer_store_dword_xyz
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: buffer_store_dword_xyz
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[REG_SEQUENCE1]], %subreg.sub1_sub2_sub3
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
+ BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORDX3_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+...
+---
+
+name: buffer_store_dwordx3_dword
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: buffer_store_dwordx3_dword
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY1]], %subreg.sub2
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1_sub2, [[COPY]], %subreg.sub3
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_96 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1, %6:vgpr_32, %subreg.sub2
+ BUFFER_STORE_DWORDX3_OFFSET_exact %14:vreg_96, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_store_dwordx2_dwordx2
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: buffer_store_dwordx2_dwordx2
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_128 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[REG_SEQUENCE2]], %subreg.sub2_sub3
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE3]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+ %15:vreg_64 = REG_SEQUENCE %6:vgpr_32, %subreg.sub0, %7:vgpr_32, %subreg.sub1
+ BUFFER_STORE_DWORDX2_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ BUFFER_STORE_DWORDX2_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+...
+---
+
+name: buffer_store_dword_dwordx2
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: buffer_store_dword_dwordx2
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[COPY]], %subreg.sub0, %10:vreg_64, %subreg.sub1_sub2
+ ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+ BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORDX2_OFFSET_exact %15:vreg_64, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+...
+---
+
+name: buffer_store_dwordx2_dword
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: buffer_store_dwordx2_dword
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_96 = REG_SEQUENCE [[REG_SEQUENCE1]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+ ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE2]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %14:vreg_64 = REG_SEQUENCE %4:vgpr_32, %subreg.sub0, %5:vgpr_32, %subreg.sub1
+ BUFFER_STORE_DWORDX2_OFFSET_exact %14:vreg_64, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_store_dword_dword
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-LABEL: name: buffer_store_dword_dword
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY7]], %subreg.sub0, [[COPY6]], %subreg.sub1, [[COPY5]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1
+ ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_store_dword_32
+body: |
+ bb.0.entry:
+ liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GCN-LABEL: name: buffer_store_dword_32
+ ; GCN: liveins: $sgpr0, $sgpr1, $sgpr2, $sgpr3, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8
+ ; GCN-NEXT: {{ $}}
+ ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr7
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr6
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr5
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr4
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr3
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY $vgpr2
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY9:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY10:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY11:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY12:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY12]], %subreg.sub0, [[COPY11]], %subreg.sub1, [[COPY10]], %subreg.sub2, [[COPY9]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY8]], %subreg.sub0, [[COPY7]], %subreg.sub1
+ ; GCN-NEXT: BUFFER_STORE_DWORDX2_OFFSET_exact killed [[REG_SEQUENCE1]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[REG_SEQUENCE2:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY6]], %subreg.sub0, [[COPY5]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE2]], %subreg.sub0_sub1, [[COPY4]], %subreg.sub2
+ ; GCN-NEXT: [[REG_SEQUENCE4:%[0-9]+]]:vreg_128 = REG_SEQUENCE killed [[REG_SEQUENCE3]], %subreg.sub0_sub1_sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: BUFFER_STORE_DWORDX4_OFFSET_exact killed [[REG_SEQUENCE4]], [[REG_SEQUENCE]], 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s128), align 1, addrspace 4)
+ ; GCN-NEXT: [[REG_SEQUENCE5:%[0-9]+]]:vreg_64 = REG_SEQUENCE [[COPY2]], %subreg.sub0, [[COPY1]], %subreg.sub1
+ ; GCN-NEXT: [[REG_SEQUENCE6:%[0-9]+]]:vreg_96 = REG_SEQUENCE killed [[REG_SEQUENCE5]], %subreg.sub0_sub1, [[COPY]], %subreg.sub2
+ ; GCN-NEXT: BUFFER_STORE_DWORDX3_OFFSET_exact killed [[REG_SEQUENCE6]], [[REG_SEQUENCE]], 0, 36, 0, 0, implicit $exec :: (dereferenceable store (s96), align 1, addrspace 4)
+ %12:vgpr_32 = COPY $vgpr8
+ %11:vgpr_32 = COPY $vgpr7
+ %10:vgpr_32 = COPY $vgpr6
+ %9:vgpr_32 = COPY $vgpr5
+ %8:vgpr_32 = COPY $vgpr4
+ %7:vgpr_32 = COPY $vgpr3
+ %6:vgpr_32 = COPY $vgpr2
+ %5:vgpr_32 = COPY $vgpr1
+ %4:vgpr_32 = COPY $vgpr0
+ %3:sgpr_32 = COPY $sgpr3
+ %2:sgpr_32 = COPY $sgpr2
+ %1:sgpr_32 = COPY $sgpr1
+ %0:sgpr_32 = COPY $sgpr0
+ %13:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ BUFFER_STORE_DWORD_OFFSET_exact %4:vgpr_32, %13:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %5:vgpr_32, %13:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %6:vgpr_32, %13:sgpr_128, 0, 16, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %7:vgpr_32, %13:sgpr_128, 0, 20, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %8:vgpr_32, %13:sgpr_128, 0, 24, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %9:vgpr_32, %13:sgpr_128, 0, 28, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %10:vgpr_32, %13:sgpr_128, 0, 36, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %11:vgpr_32, %13:sgpr_128, 0, 40, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %12:vgpr_32, %13:sgpr_128, 0, 44, 0, 0, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_not_merged_swizzled_0
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_not_merged_swizzled_0
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_not_merged_swizzled_1
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_not_merged_swizzled_1
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_merge_across_swizzle
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_merge_across_swizzle
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_OFFSET:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_OFFSET]].sub0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_OFFSET]].sub1
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 12, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %4:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_merge_across_swizzled_store
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_merge_across_swizzled_store
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET_exact [[COPY4]], [[REG_SEQUENCE]], 0, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_OFFSET1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %6:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ BUFFER_STORE_DWORD_OFFSET_exact %4:vgpr_32, %5:sgpr_128, 0, 6, 0, 1, implicit $exec :: (dereferenceable store (s32), align 1, addrspace 4)
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_idxen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_idxen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2_idxen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx2_idxen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_idxen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN]].sub0_sub1
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx3_idxen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx3_idxen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_bothen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_bothen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2_bothen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx2_bothen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_bothen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0_sub1
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx3_bothen
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx3_bothen
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_idxen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_idxen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2_idxen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx2_idxen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_idxen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub0_sub1
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx3_idxen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx3_idxen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_IDXEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_IDXEN_exact]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_idxen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_dword_idxen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_IDXEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub0_sub1
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_IDXEN_exact]].sub2
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_dword_idxen_exact_swizzled_0
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_IDXEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_IDXEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_IDXEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_bothen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_bothen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx2_bothen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dwordx2_bothen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub1_sub2
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_bothen_exact
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub0_sub1
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_64 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dwordx3_bothen_exact
+body: |
+ bb.0.entry:
+
+ ; GCN-LABEL: name: buffer_load_dword_dwordx3_bothen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX4_BOTHEN_exact:%[0-9]+]]:vreg_128 = BUFFER_LOAD_DWORDX4_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vreg_96 = COPY killed [[BUFFER_LOAD_DWORDX4_BOTHEN_exact]].sub1_sub2_sub3
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 0, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_bothen_exact
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_dword_bothen_exact
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX3_BOTHEN_exact:%[0-9]+]]:vreg_96 = BUFFER_LOAD_DWORDX3_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub0_sub1
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX3_BOTHEN_exact]].sub2
+ ; GCN-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[COPY5]].sub0
+ ; GCN-NEXT: [[COPY8:%[0-9]+]]:vgpr_32 = COPY killed [[COPY5]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dword_dword_dword_bothen_exact_swizzled_0
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORD_BOTHEN_exact:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s64), align 1, addrspace 4)
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub0
+ ; GCN-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY killed [[BUFFER_LOAD_DWORDX2_BOTHEN_exact]].sub1
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 4, 0, 1, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %8:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 8, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vgpr_32 = BUFFER_LOAD_DWORD_BOTHEN_exact %4, %5:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_vaddr
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr1
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vreg_64 = COPY $vgpr0
+ %5:vreg_64 = COPY $vgpr1
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %4, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %6:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_bothen_exact_diff_srsrc
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vreg_64 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_BOTHEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:sgpr_32 = COPY $sgpr4
+ %5:vreg_64 = COPY $vgpr0
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_BOTHEN_exact %5, %7:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_vaddr
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY4]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:vgpr_32 = COPY $vgpr0
+ %5:vgpr_32 = COPY $vgpr1
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %4, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %6:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
+---
+
+name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc
+body: |
+ bb.0.entry:
+ ; GCN-LABEL: name: buffer_load_dwordx2_dwordx2_idxen_exact_diff_srsrc
+ ; GCN: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GCN-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GCN-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GCN-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GCN-NEXT: [[COPY4:%[0-9]+]]:sgpr_32 = COPY $sgpr4
+ ; GCN-NEXT: [[COPY5:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+ ; GCN-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GCN-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY2]], %subreg.sub1, [[COPY3]], %subreg.sub2, [[COPY4]], %subreg.sub3
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE]], 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GCN-NEXT: [[BUFFER_LOAD_DWORDX2_IDXEN_exact1:%[0-9]+]]:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact [[COPY5]], [[REG_SEQUENCE1]], 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %0:sgpr_32 = COPY $sgpr0
+ %1:sgpr_32 = COPY $sgpr1
+ %2:sgpr_32 = COPY $sgpr2
+ %3:sgpr_32 = COPY $sgpr3
+ %4:sgpr_32 = COPY $sgpr4
+ %5:vgpr_32 = COPY $vgpr0
+ %6:sgpr_128 = REG_SEQUENCE %0:sgpr_32, %subreg.sub0, %1:sgpr_32, %subreg.sub1, %2:sgpr_32, %subreg.sub2, %3:sgpr_32, %subreg.sub3
+ %7:sgpr_128 = REG_SEQUENCE %1:sgpr_32, %subreg.sub0, %2:sgpr_32, %subreg.sub1, %3:sgpr_32, %subreg.sub2, %4:sgpr_32, %subreg.sub3
+ %8:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %6:sgpr_128, 0, 4, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ %9:vreg_64 = BUFFER_LOAD_DWORDX2_IDXEN_exact %5, %7:sgpr_128, 0, 12, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+...
diff --git a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
index c86b5ad..9766b42 100644
--- a/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
+++ b/llvm/test/CodeGen/AMDGPU/merge-tbuffer.mir
@@ -7,9 +7,37 @@
# GFX9 tests
#
+---
name: gfx9_tbuffer_load_x_xyz
body: |
bb.0.entry:
+ ; GFX9-LABEL: name: gfx9_tbuffer_load_x_xyz
+ ; GFX9: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX9-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX9-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX9-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX9-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX9-NEXT: [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET:%[0-9]+]]:vreg_128 = TBUFFER_LOAD_FORMAT_XYZW_OFFSET [[REG_SEQUENCE]], 0, 4, 126, 0, 0, implicit $exec :: (dereferenceable load (s128), align 1, addrspace 4)
+ ; GFX9-NEXT: [[COPY4:%[0-9]+]]:vgpr_32 = COPY [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub0
+ ; GFX9-NEXT: [[COPY5:%[0-9]+]]:vreg_96 = COPY killed [[TBUFFER_LOAD_FORMAT_XYZW_OFFSET]].sub1_sub2_sub3
+ ;
+ ; GFX10-LABEL: name: gfx9_tbuffer_load_x_xyz
+ ; GFX10: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX10-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX10-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX10-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX10-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX10-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
+ ;
+ ; GFX11-LABEL: name: gfx9_tbuffer_load_x_xyz
+ ; GFX11: [[COPY:%[0-9]+]]:sgpr_32 = COPY $sgpr0
+ ; GFX11-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr1
+ ; GFX11-NEXT: [[COPY2:%[0-9]+]]:sgpr_32 = COPY $sgpr2
+ ; GFX11-NEXT: [[COPY3:%[0-9]+]]:sgpr_32 = COPY $sgpr3
+ ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:sgpr_128 = REG_SEQUENCE [[COPY]], %subreg.sub0, [[COPY1]], %subreg.sub1, [[COPY2]], %subreg.sub2, [[COPY3]], %subreg.sub3
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_X_OFFSET:%[0-9]+]]:vgpr_32 = TBUFFER_LOAD_FORMAT_X_OFFSET [[REG_SEQUENCE]], 0, 4, 116, 0, 0, implicit $exec :: (dereferenceable load (s32), align 1, addrspace 4)
+ ; GFX11-NEXT: [[TBUFFER_LOAD_FORMAT_XYZ_OFFSET:%[0-9]+]]:vreg_96 = TBUFFER_LOAD_FORMAT_XYZ_OFFSET [[REG_SEQUENCE]], 0, 8, 125, 0, 0, implicit $exec :: (dereferenceable load (s96), align 1, addrspace 4)
%0:sgpr_32 = COPY $sgpr0
%1:sgpr_32 = COPY $sgpr1
%2:sgpr_32 = COPY $sgpr2
diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
index cbdc7bb..69971bc 100644
--- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
+++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll
@@ -27,7 +27,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-LABEL: csr_vgpr_spill_fp_callee:
; CHECK: ; %bb.0: ; %bb
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s24, s33
+; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill
@@ -43,7 +43,6 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: ;;#ASMSTART
; CHECK-NEXT: ; clobber csr v40
@@ -55,7 +54,7 @@ define internal fastcc void @csr_vgpr_spill_fp_callee() #0 {
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT: s_mov_b32 s33, s24
+; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
bb:
@@ -88,7 +87,6 @@ define amdgpu_kernel void @kernel_call() {
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
bb:
@@ -148,7 +146,6 @@ define amdgpu_kernel void @kernel_tailcall() {
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
bb:
@@ -173,7 +170,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s24, s33
+; CHECK-NEXT: s_mov_b32 s18, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -188,7 +185,6 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_readlane_b32 s31, v1, 1
; CHECK-NEXT: v_readlane_b32 s30, v1, 0
@@ -196,7 +192,7 @@ define hidden i32 @caller_save_vgpr_spill_fp_tail_call() #0 {
; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT: s_mov_b32 s33, s24
+; CHECK-NEXT: s_mov_b32 s33, s18
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -208,7 +204,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-LABEL: caller_save_vgpr_spill_fp:
; CHECK: ; %bb.0: ; %entry
; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; CHECK-NEXT: s_mov_b32 s25, s33
+; CHECK-NEXT: s_mov_b32 s19, s33
; CHECK-NEXT: s_mov_b32 s33, s32
; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1
; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill
@@ -223,7 +219,6 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: v_readlane_b32 s31, v2, 1
; CHECK-NEXT: v_readlane_b32 s30, v2, 0
@@ -231,7 +226,7 @@ define hidden i32 @caller_save_vgpr_spill_fp() #0 {
; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload
; CHECK-NEXT: s_mov_b64 exec, s[4:5]
; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00
-; CHECK-NEXT: s_mov_b32 s33, s25
+; CHECK-NEXT: s_mov_b32 s33, s19
; CHECK-NEXT: s_waitcnt vmcnt(0)
; CHECK-NEXT: s_setpc_b64 s[30:31]
entry:
@@ -263,7 +258,6 @@ define protected amdgpu_kernel void @kernel() {
; CHECK-NEXT: ; implicit-def: $sgpr15
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
entry:
diff --git a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
index 3de258b..bf2cf6a 100644
--- a/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
+++ b/llvm/test/CodeGen/AMDGPU/neighboring-mfma-padding.mir
@@ -5,6 +5,14 @@
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-padding-ratio=75 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx908-PAD75 %s
# RUN: llc -mtriple=amdgcn -mcpu=gfx908 -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx908-PAD100 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-DEFAULT %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=50 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-PAD50 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx90a -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx90a-PAD100 %s
+
+# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-DEFAULT %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-mfma-padding-ratio=50 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-PAD50 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-mfma-padding-ratio=100 -verify-machineinstrs -run-pass post-RA-hazard-rec %s -o - | FileCheck -check-prefix=gfx940-PAD100 %s
+
---
name: mfma_padding_2_pass
body: |
@@ -31,6 +39,35 @@ body: |
; gfx908-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 1
; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass
+ ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass
+ ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: S_NOP 0
+ ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass
+ ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_NOP 1
+ ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass
+ ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: S_NOP 1
+ ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass
+ ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: S_NOP 1
+ ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass
+ ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: S_NOP 1
+ ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
...
@@ -64,6 +101,40 @@ body: |
; gfx908-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 0
; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass_1_intervening_valu
+ ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu
+ ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass_1_intervening_valu
+ ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_NOP 0
+ ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass_1_intervening_valu
+ ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: S_NOP 0
+ ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass_1_intervening_valu
+ ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: S_NOP 0
+ ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass_1_intervening_valu
+ ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: S_NOP 0
+ ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
@@ -100,6 +171,41 @@ body: |
; gfx908-PAD100-NEXT: DBG_VALUE
; gfx908-PAD100-NEXT: S_NOP 1
; gfx908-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_2_pass_dbg
+ ; gfx90a-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: DBG_VALUE
+ ; gfx90a-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_2_pass_dbg
+ ; gfx90a-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: DBG_VALUE
+ ; gfx90a-PAD50-NEXT: S_NOP 0
+ ; gfx90a-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_2_pass_dbg
+ ; gfx90a-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: DBG_VALUE
+ ; gfx90a-PAD100-NEXT: S_NOP 1
+ ; gfx90a-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_2_pass_dbg
+ ; gfx940-DEFAULT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: DBG_VALUE
+ ; gfx940-DEFAULT-NEXT: S_NOP 1
+ ; gfx940-DEFAULT-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_2_pass_dbg
+ ; gfx940-PAD50: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: DBG_VALUE
+ ; gfx940-PAD50-NEXT: S_NOP 1
+ ; gfx940-PAD50-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_2_pass_dbg
+ ; gfx940-PAD100: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: DBG_VALUE
+ ; gfx940-PAD100-NEXT: S_NOP 1
+ ; gfx940-PAD100-NEXT: $agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
DBG_VALUE
$agpr0_agpr1_agpr2_agpr3 = V_MFMA_F32_4X4X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3, 0, 0, 0, implicit $mode, implicit $exec
@@ -132,6 +238,34 @@ body: |
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_8_pass
+ ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_8_pass
+ ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: S_NOP 3
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_8_pass
+ ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_NOP 7
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_8_pass
+ ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_8_pass
+ ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: S_NOP 3
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_8_pass
+ ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: S_NOP 7
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...
@@ -172,6 +306,46 @@ body: |
; gfx908-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: S_NOP 5
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_8_pass_2_intervening_valu
+ ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu
+ ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: S_NOP 1
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu
+ ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_NOP 5
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_8_pass_2_intervening_valu
+ ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_8_pass_2_intervening_valu
+ ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: S_NOP 1
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_8_pass_2_intervening_valu
+ ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: S_NOP 5
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_16X16X1F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
@@ -207,6 +381,36 @@ body: |
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass
+ ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass
+ ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: S_NOP 7
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass
+ ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_NOP 7
+ ; gfx90a-PAD100-NEXT: S_NOP 7
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass
+ ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass
+ ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: S_NOP 7
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass
+ ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: S_NOP 7
+ ; gfx940-PAD100-NEXT: S_NOP 7
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...
@@ -258,6 +462,60 @@ body: |
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: S_NOP 3
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
+ ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu
+ ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: S_NOP 3
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu
+ ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_NOP 7
+ ; gfx90a-PAD100-NEXT: S_NOP 3
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_4_intervening_valu
+ ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_4_intervening_valu
+ ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: S_NOP 3
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_4_intervening_valu
+ ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: S_NOP 7
+ ; gfx940-PAD100-NEXT: S_NOP 3
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
@@ -369,6 +627,126 @@ body: |
; gfx908-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_16_intervening_valu
+ ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu
+ ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu
+ ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_16_intervening_valu
+ ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_16_intervening_valu
+ ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_16_intervening_valu
+ ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr4 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr5 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr6 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr7 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr8 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr9 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr10 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr11 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr12 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr13 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr14 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr15 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr16 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: $vgpr17 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$vgpr2 = V_MOV_B32_e32 1, implicit $exec
$vgpr3 = V_MOV_B32_e32 1, implicit $exec
@@ -414,6 +792,30 @@ body: |
; gfx908-PAD100-LABEL: name: mfma_padding_16_pass_occ_1
; gfx908-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1
+ ; gfx90a-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_occ_1
+ ; gfx90a-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_occ_1
+ ; gfx90a-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_occ_1
+ ; gfx940-DEFAULT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_occ_1
+ ; gfx940-PAD50: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_occ_1
+ ; gfx940-PAD100: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
...
@@ -506,6 +908,108 @@ body: |
; gfx908-PAD100-NEXT: S_NOP 7
; gfx908-PAD100-NEXT: S_NOP 5
; gfx908-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
+ ; gfx90a-DEFAULT: bb.0:
+ ; gfx90a-DEFAULT-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; gfx90a-DEFAULT-NEXT: {{ $}}
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+ ; gfx90a-DEFAULT-NEXT: {{ $}}
+ ; gfx90a-DEFAULT-NEXT: bb.1:
+ ; gfx90a-DEFAULT-NEXT: successors: %bb.2(0x80000000)
+ ; gfx90a-DEFAULT-NEXT: {{ $}}
+ ; gfx90a-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: {{ $}}
+ ; gfx90a-DEFAULT-NEXT: bb.2:
+ ; gfx90a-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD50-LABEL: name: mfma_padding_16_pass_2_preds
+ ; gfx90a-PAD50: bb.0:
+ ; gfx90a-PAD50-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; gfx90a-PAD50-NEXT: {{ $}}
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD50-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+ ; gfx90a-PAD50-NEXT: {{ $}}
+ ; gfx90a-PAD50-NEXT: bb.1:
+ ; gfx90a-PAD50-NEXT: successors: %bb.2(0x80000000)
+ ; gfx90a-PAD50-NEXT: {{ $}}
+ ; gfx90a-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: {{ $}}
+ ; gfx90a-PAD50-NEXT: bb.2:
+ ; gfx90a-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD50-NEXT: S_NOP 5
+ ; gfx90a-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx90a-PAD100-LABEL: name: mfma_padding_16_pass_2_preds
+ ; gfx90a-PAD100: bb.0:
+ ; gfx90a-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; gfx90a-PAD100-NEXT: {{ $}}
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+ ; gfx90a-PAD100-NEXT: {{ $}}
+ ; gfx90a-PAD100-NEXT: bb.1:
+ ; gfx90a-PAD100-NEXT: successors: %bb.2(0x80000000)
+ ; gfx90a-PAD100-NEXT: {{ $}}
+ ; gfx90a-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: {{ $}}
+ ; gfx90a-PAD100-NEXT: bb.2:
+ ; gfx90a-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx90a-PAD100-NEXT: S_NOP 7
+ ; gfx90a-PAD100-NEXT: S_NOP 5
+ ; gfx90a-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-DEFAULT-LABEL: name: mfma_padding_16_pass_2_preds
+ ; gfx940-DEFAULT: bb.0:
+ ; gfx940-DEFAULT-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; gfx940-DEFAULT-NEXT: {{ $}}
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-DEFAULT-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+ ; gfx940-DEFAULT-NEXT: {{ $}}
+ ; gfx940-DEFAULT-NEXT: bb.1:
+ ; gfx940-DEFAULT-NEXT: successors: %bb.2(0x80000000)
+ ; gfx940-DEFAULT-NEXT: {{ $}}
+ ; gfx940-DEFAULT-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: {{ $}}
+ ; gfx940-DEFAULT-NEXT: bb.2:
+ ; gfx940-DEFAULT-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-DEFAULT-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD50-LABEL: name: mfma_padding_16_pass_2_preds
+ ; gfx940-PAD50: bb.0:
+ ; gfx940-PAD50-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; gfx940-PAD50-NEXT: {{ $}}
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD50-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+ ; gfx940-PAD50-NEXT: {{ $}}
+ ; gfx940-PAD50-NEXT: bb.1:
+ ; gfx940-PAD50-NEXT: successors: %bb.2(0x80000000)
+ ; gfx940-PAD50-NEXT: {{ $}}
+ ; gfx940-PAD50-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: {{ $}}
+ ; gfx940-PAD50-NEXT: bb.2:
+ ; gfx940-PAD50-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD50-NEXT: S_NOP 5
+ ; gfx940-PAD50-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ;
+ ; gfx940-PAD100-LABEL: name: mfma_padding_16_pass_2_preds
+ ; gfx940-PAD100: bb.0:
+ ; gfx940-PAD100-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000)
+ ; gfx940-PAD100-NEXT: {{ $}}
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
+ ; gfx940-PAD100-NEXT: S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
+ ; gfx940-PAD100-NEXT: {{ $}}
+ ; gfx940-PAD100-NEXT: bb.1:
+ ; gfx940-PAD100-NEXT: successors: %bb.2(0x80000000)
+ ; gfx940-PAD100-NEXT: {{ $}}
+ ; gfx940-PAD100-NEXT: $vgpr2 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: {{ $}}
+ ; gfx940-PAD100-NEXT: bb.2:
+ ; gfx940-PAD100-NEXT: $vgpr3 = V_MOV_B32_e32 1, implicit $exec
+ ; gfx940-PAD100-NEXT: S_NOP 7
+ ; gfx940-PAD100-NEXT: S_NOP 5
+ ; gfx940-PAD100-NEXT: early-clobber $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
bb.0:
$agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = V_MFMA_F32_32X32X2F32_e64 $vgpr1, $vgpr0, $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, 0, 0, 0, implicit $mode, implicit $exec
S_CBRANCH_VCCZ %bb.2, implicit undef $vcc
diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
index 34e67d0..9999cb9 100644
--- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
+++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll
@@ -32,7 +32,6 @@ define hidden void @_ZL3barv() #0 !dbg !1644 {
; CHECK-NEXT: s_mov_b64 s[20:21], s[0:1]
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: .Ltmp1:
diff --git a/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
new file mode 100644
index 0000000..538ce15
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/pal-metadata-3.0-callable.ll
@@ -0,0 +1,305 @@
+; RUN: llc -mtriple=amdgcn--amdpal -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s
+
+; CHECK: .amdgpu_pal_metadata
+; CHECK-NEXT: ---
+; CHECK-NEXT: amdpal.pipelines:
+; CHECK-NEXT: - .api: Vulkan
+; CHECK-NEXT: .compute_registers:
+; CHECK-NEXT: .tg_size_en: true
+; CHECK-NEXT: .tgid_x_en: false
+; CHECK-NEXT: .tgid_y_en: false
+; CHECK-NEXT: .tgid_z_en: false
+; CHECK-NEXT: .tidig_comp_cnt: 0x1
+; CHECK-NEXT: .hardware_stages:
+; CHECK-NEXT: .cs:
+; CHECK-NEXT: .checksum_value: 0x9444d7d0
+; CHECK-NEXT: .debug_mode: 0
+; CHECK-NEXT: .excp_en: 0
+; CHECK-NEXT: .float_mode: 0xc0
+; CHECK-NEXT: .ieee_mode: true
+; CHECK-NEXT: .image_op: false
+; CHECK-NEXT: .lds_size: 0x200
+; CHECK-NEXT: .mem_ordered: true
+; CHECK-NEXT: .sgpr_limit: 0x6a
+; CHECK-NEXT: .threadgroup_dimensions:
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: - 0x400
+; CHECK-NEXT: - 0x1
+; CHECK-NEXT: .trap_present: false
+; CHECK-NEXT: .user_data_reg_map:
+; CHECK-NEXT: - 0x10000000
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: - 0xffffffff
+; CHECK-NEXT: .user_sgprs: 0x3
+; CHECK-NEXT: .vgpr_limit: 0x100
+; CHECK-NEXT: .wavefront_size: 0x40
+; CHECK-NEXT: .wgp_mode: true
+; CHECK: .registers: {}
+; CHECK-NEXT: .shader_functions:
+; CHECK-NEXT: dynamic_stack:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: dynamic_stack_loop:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: multiple_stack:
+; CHECK-NEXT: .backend_stack_size: 0x24
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x21
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x24
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: no_stack:
+; CHECK-NEXT: .backend_stack_size: 0
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x20
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: no_stack_call:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x3
+; CHECK-NEXT: no_stack_extern_call:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: no_stack_extern_call_many_args:
+; CHECK-NEXT: .backend_stack_size: 0x90
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x90
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: no_stack_indirect_call:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_lds:
+; CHECK-NEXT: .backend_stack_size: 0
+; CHECK-NEXT: .lds_size: 0x100
+; CHECK-NEXT: .sgpr_count: 0x20
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0
+; CHECK-NEXT: .vgpr_count: 0x1
+; CHECK-NEXT: simple_lds_recurse:
+; CHECK-NEXT: .backend_stack_size: 0x10
+; CHECK-NEXT: .lds_size: 0x100
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x10
+; CHECK-NEXT: .vgpr_count: 0x29
+; CHECK-NEXT: simple_stack:
+; CHECK-NEXT: .backend_stack_size: 0x14
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x21
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x14
+; CHECK-NEXT: .vgpr_count: 0x2
+; CHECK-NEXT: simple_stack_call:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x22
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x4
+; CHECK-NEXT: simple_stack_extern_call:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_stack_indirect_call:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x29
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x58
+; CHECK-NEXT: simple_stack_recurse:
+; CHECK-NEXT: .backend_stack_size: 0x20
+; CHECK-NEXT: .lds_size: 0
+; CHECK-NEXT: .sgpr_count: 0x24
+; CHECK-NEXT: .stack_frame_size_in_bytes: 0x20
+; CHECK-NEXT: .vgpr_count: 0x2a
+; CHECK:amdpal.version:
+; CHECK-NEXT: - 0x3
+; CHECK-NEXT: - 0
+; CHECK-NEXT:...
+; CHECK-NEXT: .end_amdgpu_pal_metadata
+
+declare amdgpu_gfx float @extern_func(float) #0
+declare amdgpu_gfx float @extern_func_many_args(<64 x float>) #0
+
+@funcptr = external hidden unnamed_addr addrspace(4) constant ptr, align 4
+
+define amdgpu_gfx float @no_stack(float %arg0) #0 {
+ %add = fadd float %arg0, 1.0
+ ret float %add
+}
+
+define amdgpu_gfx float @simple_stack(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @multiple_stack(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ %stack2 = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack2
+ %val2 = load volatile float, ptr addrspace(5) %stack2
+ %add2 = fadd float %add, %val2
+ ret float %add2
+}
+
+define amdgpu_gfx float @dynamic_stack(float %arg0) #0 {
+bb0:
+ %cmp = fcmp ogt float %arg0, 0.0
+ br i1 %cmp, label %bb1, label %bb2
+
+bb1:
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ br label %bb2
+
+bb2:
+ %res = phi float [ 0.0, %bb0 ], [ %add, %bb1 ]
+ ret float %res
+}
+
+define amdgpu_gfx float @dynamic_stack_loop(float %arg0) #0 {
+bb0:
+ br label %bb1
+
+bb1:
+ %ctr = phi i32 [ 0, %bb0 ], [ %newctr, %bb1 ]
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %add = fadd float %arg0, %val
+ %cmp = icmp sgt i32 %ctr, 0
+ %newctr = sub i32 %ctr, 1
+ br i1 %cmp, label %bb1, label %bb2
+
+bb2:
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_call(float %arg0) #0 {
+ %res = call amdgpu_gfx float @simple_stack(float %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @simple_stack(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call(float %arg0) #0 {
+ %res = call amdgpu_gfx float @extern_func(float %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @simple_stack_extern_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @extern_func(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @no_stack_extern_call_many_args(<64 x float> %arg0) #0 {
+ %res = call amdgpu_gfx float @extern_func_many_args(<64 x float> %arg0)
+ ret float %res
+}
+
+define amdgpu_gfx float @no_stack_indirect_call(float %arg0) #0 {
+ %fptr = load ptr, ptr addrspace(4) @funcptr
+ call amdgpu_gfx void %fptr()
+ ret float %arg0
+}
+
+define amdgpu_gfx float @simple_stack_indirect_call(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %fptr = load ptr, ptr addrspace(4) @funcptr
+ call amdgpu_gfx void %fptr()
+ %add = fadd float %arg0, %val
+ ret float %add
+}
+
+define amdgpu_gfx float @simple_stack_recurse(float %arg0) #0 {
+ %stack = alloca float, i32 4, align 4, addrspace(5)
+ store volatile float 2.0, ptr addrspace(5) %stack
+ %val = load volatile float, ptr addrspace(5) %stack
+ %res = call amdgpu_gfx float @simple_stack_recurse(float %arg0)
+ %add = fadd float %res, %val
+ ret float %add
+}
+
+@lds = internal addrspace(3) global [64 x float] undef
+
+define amdgpu_gfx float @simple_lds(float %arg0) #0 {
+ %val = load float, ptr addrspace(3) @lds
+ ret float %val
+}
+
+define amdgpu_gfx float @simple_lds_recurse(float %arg0) #0 {
+ %val = load float, ptr addrspace(3) @lds
+ %res = call amdgpu_gfx float @simple_lds_recurse(float %val)
+ ret float %res
+}
+
+attributes #0 = { nounwind }
+
+!amdgpu.pal.metadata.msgpack = !{!0}
+
+!0 = !{!"\82\B0amdpal.pipelines\91\8A\A4.api\A6Vulkan\B2.compute_registers\85\AB.tg_size_en\C3\AA.tgid_x_en\C2\AA.tgid_y_en\C2\AA.tgid_z_en\C2\AF.tidig_comp_cnt\01\B0.hardware_stages\81\A3.cs\8C\AF.checksum_value\CE\94D\D7\D0\AB.debug_mode\00\AB.float_mode\CC\C0\A9.image_op\C2\AC.mem_ordered\C3\AB.sgpr_limitj\B7.threadgroup_dimensions\93\01\CD\04\00\01\AD.trap_present\00\B2.user_data_reg_map\DC\00 \CE\10\00\00\00\CE\FF\FF\FF\FF\00\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\CE\FF\FF\FF\FF\AB.user_sgprs\03\AB.vgpr_limit\CD\01\00\AF.wavefront_size@\B7.internal_pipeline_hash\92\CF\E7\10k\A6:\A6%\F7\CF\B2\1F\1A\D4{\DA\E1T\AA.registers\80\A8.shaders\81\A8.compute\82\B0.api_shader_hash\92\CF\E9Zn7}\1E\B9\E7\00\B1.hardware_mapping\91\A3.cs\B0.spill_threshold\CE\FF\FF\FF\FF\A5.type\A2Cs\B0.user_data_limit\01\AF.xgl_cache_info\82\B3.128_bit_cache_hash\92\CF\B4X\B8\11[\A4\88P\CF\A0;\B0\AF\FF\B4\BE\C0\AD.llpc_version\A461.1\AEamdpal.version\92\03\00"}
+!1 = !{i32 7}
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
index a70488a..a030f86 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernarg-header.ll
@@ -1,17 +1,20 @@
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA %s
-; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA %s
-; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | FileCheck -check-prefixes=GCN,HSA,ASM %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s
+; RUN: llc -mtriple=amdgcn -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -filetype=obj < %s | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,NON-HSA,OBJ %s
+; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -amdgpu-kernarg-preload-count=1 -asm-verbose=0 < %s | llvm-mc -triple amdgcn-amd-amdhsa -mcpu=gfx940 -filetype=obj | llvm-objdump --arch=amdgcn --mcpu=gfx940 --disassemble - | FileCheck -check-prefixes=GCN,HSA,OBJ %s
; GCN: preload_kernarg_header
; HSA: s_trap 2
; NON-HSA: s_endpgm
-; GCN-COUNT-63: s_nop 0
+; ASM: .fill 63, 4, 0xbf800000 ; s_nop 0
+; OBJ-COUNT-63: s_nop 0
define amdgpu_kernel void @preload_kernarg_header(ptr %arg) {
store ptr %arg, ptr %arg
ret void
}
; GCN: non_kernel_function
+; GCN-NOT: s_trap 2
; GCN-NOT: s_nop 0
; GCN: flat_store
define void @non_kernel_function(ptr %arg) {
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
index e7488e0..20edbd6 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs-inreg-hints.ll
@@ -157,27 +157,27 @@ define amdgpu_kernel void @test_preload_hint_kernel_1_call_func(ptr %0) #0 {
define amdgpu_kernel void @test_preload_hint_kernel_1_call_intrinsic(i16 %0) #0 {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR2]] {
+; NO-PRELOAD-SAME: (i16 [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; NO-PRELOAD-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-1-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-1-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-1-NEXT: ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-3-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-3-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-3-NEXT: ret void
;
; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-16-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-16-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-16-NEXT: ret void
;
; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_1_call_intrinsic
-; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR2]] {
+; PRELOAD-20-SAME: (i16 inreg [[TMP0:%.*]]) #[[ATTR3:[0-9]+]] {
; PRELOAD-20-NEXT: call void @llvm.amdgcn.set.prio(i16 [[TMP0]])
; PRELOAD-20-NEXT: ret void
;
@@ -235,23 +235,23 @@ define amdgpu_kernel void @test_preload_hint_kernel_2_preexisting(i32 inreg %0,
define amdgpu_kernel void @test_preload_hint_kernel_incompatible_attributes(ptr addrspace(4) byref(i32) %0, ptr nest %1) {
; NO-PRELOAD-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; NO-PRELOAD-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; NO-PRELOAD-NEXT: ret void
;
; PRELOAD-1-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-1-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-1-NEXT: ret void
;
; PRELOAD-3-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-3-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-3-NEXT: ret void
;
; PRELOAD-16-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-16-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-16-NEXT: ret void
;
; PRELOAD-20-LABEL: define {{[^@]+}}@test_preload_hint_kernel_incompatible_attributes
-; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR3:[0-9]+]] {
+; PRELOAD-20-SAME: (ptr addrspace(4) byref(i32) [[TMP0:%.*]], ptr nest [[TMP1:%.*]]) #[[ATTR4:[0-9]+]] {
; PRELOAD-20-NEXT: ret void
;
ret void
diff --git a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
index d20c3a4..f0e709b 100644
--- a/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
+++ b/llvm/test/CodeGen/AMDGPU/preload-kernargs.ll
@@ -24,70 +24,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: ptr1_i8:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -98,70 +36,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: ptr1_i8:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xff
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -170,70 +46,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: ptr1_i8:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xff
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -242,70 +56,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: ptr1_i8:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xff
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -325,70 +77,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -399,70 +89,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -471,70 +99,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -543,70 +109,8 @@ define amdgpu_kernel void @ptr1_i8(ptr addrspace(1) %out, i8 %arg0) {
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: ptr1_i8:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xff
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -631,70 +135,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -705,70 +147,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
@@ -778,70 +158,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
@@ -851,70 +169,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
@@ -935,70 +191,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -1009,70 +203,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_mov_b32 s0, 0xffff
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
@@ -1082,70 +214,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_mov_b32 s0, 0xffff
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8
@@ -1155,70 +225,8 @@ define amdgpu_kernel void @ptr1_i8_zext_arg(ptr addrspace(1) %out, i8 zeroext %a
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: ptr1_i8_zext_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_mov_b32 s0, 0xffff
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
@@ -1244,70 +252,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -1318,70 +264,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_and_b32 s0, s4, 0xffff
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -1390,70 +274,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_and_b32 s0, s4, 0xffff
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -1462,70 +284,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_and_b32 s0, s4, 0xffff
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -1545,70 +305,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -1619,70 +317,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -1691,70 +327,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -1763,70 +337,8 @@ define amdgpu_kernel void @ptr1_i16_preload_arg(ptr addrspace(1) %out, i16 %arg0
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: ptr1_i16_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_and_b32 s0, s8, 0xffff
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -1850,70 +362,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -1923,70 +373,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s4
@@ -1994,70 +382,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s4
@@ -2065,70 +391,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s4
@@ -2146,70 +410,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -2219,70 +421,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s8
@@ -2290,70 +430,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s8
@@ -2361,70 +439,8 @@ define amdgpu_kernel void @ptr1_i32_preload_arg(ptr addrspace(1) %out, i32 %arg0
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s8
@@ -2449,70 +465,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dword s3, s[0:1], 0x10
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x8
@@ -2524,70 +478,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x10
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -2598,70 +490,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_add_i32 s0, s2, s6
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -2670,70 +500,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_add_i32 s0, s2, s6
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -2754,70 +522,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dword s2, s[4:5], 0x10
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
@@ -2829,70 +535,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x10
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -2903,70 +547,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_add_i32 s0, s6, s10
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -2975,70 +557,8 @@ define amdgpu_kernel void @i32_ptr1_i32_preload_arg(i32 %arg0, ptr addrspace(1)
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: i32_ptr1_i32_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_add_i32 s0, s6, s10
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -3065,70 +585,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -3141,70 +599,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-PRELOAD-2-NEXT: s_and_b32 s1, s4, 0xffff
@@ -3217,70 +613,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 16
; GFX940-PRELOAD-4-NEXT: s_and_b32 s1, s4, 0xffff
@@ -3291,70 +625,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 16
; GFX940-PRELOAD-8-NEXT: s_and_b32 s1, s4, 0xffff
@@ -3378,70 +650,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -3454,70 +664,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX90a-PRELOAD-2-NEXT: s_and_b32 s1, s8, 0xffff
@@ -3530,70 +678,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 16
; GFX90a-PRELOAD-4-NEXT: s_and_b32 s1, s8, 0xffff
@@ -3604,70 +690,8 @@ define amdgpu_kernel void @ptr1_i16_i16_preload_arg(ptr addrspace(1) %out, i16 %
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: ptr1_i16_i16_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 16
; GFX90a-PRELOAD-8-NEXT: s_and_b32 s1, s8, 0xffff
@@ -3695,70 +719,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dword s0, s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -3768,70 +730,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -3841,70 +741,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -3914,70 +752,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -3997,70 +773,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dword s0, s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -4070,70 +784,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -4143,70 +795,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8
; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -4216,70 +806,8 @@ define amdgpu_kernel void @ptr1_v2i8_preload_arg(ptr addrspace(1) %out, <2 x i8>
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: ptr1_v2i8_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -4308,70 +836,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -4385,70 +851,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -4462,70 +866,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -4539,70 +881,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x100
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -4630,70 +910,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -4707,70 +925,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
@@ -4784,70 +940,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
@@ -4861,70 +955,8 @@ define amdgpu_kernel void @byref_preload_arg(ptr addrspace(1) %out, ptr addrspac
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: byref_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x100
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
@@ -4964,70 +996,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: v8i32_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
@@ -5046,70 +1016,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: v8i32_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
@@ -5128,70 +1036,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: v8i32_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
@@ -5210,70 +1056,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: v8i32_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x20
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
@@ -5311,70 +1095,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v4, 0
@@ -5393,70 +1115,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v4, 0
@@ -5475,70 +1135,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v4, 0
@@ -5557,70 +1155,8 @@ define amdgpu_kernel void @v8i32_arg(ptr addrspace(1) nocapture %out, <8 x i32>
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: v8i32_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x20
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v4, 0
@@ -5654,70 +1190,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -5729,70 +1203,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s5
@@ -5802,70 +1214,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s5
@@ -5875,70 +1225,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s5
@@ -5959,70 +1247,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -6034,70 +1260,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s9
@@ -6107,70 +1271,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s9
@@ -6180,70 +1282,8 @@ define amdgpu_kernel void @v3i16_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: v3i16_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, 0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s9
@@ -6269,70 +1309,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
@@ -6344,70 +1322,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s7
@@ -6417,70 +1333,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s7
@@ -6490,70 +1344,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s7
@@ -6575,70 +1367,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
@@ -6650,70 +1380,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v1, s11
@@ -6723,70 +1391,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v1, s11
@@ -6796,70 +1402,8 @@ define amdgpu_kernel void @v3i32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: v3i32_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v1, s11
@@ -6885,70 +1429,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x10
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
@@ -6960,70 +1442,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s6
@@ -7033,70 +1453,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s6
@@ -7106,70 +1464,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s6
@@ -7191,70 +1487,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v3, 0
@@ -7266,70 +1500,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v0, s10
@@ -7339,70 +1511,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v0, s10
@@ -7412,70 +1522,8 @@ define amdgpu_kernel void @v3f32_preload_arg(ptr addrspace(1) nocapture %out, <3
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: v3f32_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v3, 0
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v0, s10
@@ -7500,70 +1548,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -7575,70 +1561,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s4, 8
; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -7655,70 +1579,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s4, 8
; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -7735,70 +1597,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s4, 8
; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -7826,70 +1626,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v0, 0
@@ -7901,70 +1639,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s8, 8
; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -7981,70 +1657,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s8, 8
; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -8061,70 +1675,8 @@ define amdgpu_kernel void @v5i8_preload_arg(ptr addrspace(1) nocapture %out, <5
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: v5i8_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s8, 8
; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -8167,70 +1719,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: v5f64_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
; GFX940-PRELOAD-1-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -8252,70 +1742,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: v5f64_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
; GFX940-PRELOAD-2-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -8337,70 +1765,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: v5f64_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
; GFX940-PRELOAD-4-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -8422,70 +1788,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: v5f64_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_load_dwordx2 s[12:13], s[0:1], 0x60
; GFX940-PRELOAD-8-NEXT: s_load_dwordx8 s[4:11], s[0:1], 0x40
@@ -8529,70 +1833,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -8614,70 +1856,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-PRELOAD-2-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -8699,70 +1879,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-PRELOAD-4-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -8784,70 +1902,8 @@ define amdgpu_kernel void @v5f64_arg(ptr addrspace(1) nocapture %out, <5 x doubl
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: v5f64_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x60
; GFX90a-PRELOAD-8-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x40
@@ -8882,70 +1938,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
@@ -8955,70 +1949,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: s_lshr_b32 s0, s5, 8
; GFX940-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -9042,70 +1974,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: s_lshr_b32 s0, s5, 8
; GFX940-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -9129,70 +1999,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: s_lshr_b32 s0, s5, 8
; GFX940-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -9225,70 +2033,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
@@ -9298,70 +2044,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: s_lshr_b32 s0, s9, 8
; GFX90a-PRELOAD-2-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -9384,70 +2068,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: s_lshr_b32 s0, s9, 8
; GFX90a-PRELOAD-4-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -9470,70 +2092,8 @@ define amdgpu_kernel void @v8i8_preload_arg(ptr addrspace(1) %out, <8 x i8> %in)
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: v8i8_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: s_lshr_b32 s0, s9, 8
; GFX90a-PRELOAD-8-NEXT: v_lshlrev_b16_e64 v0, 8, s0
@@ -9570,70 +2130,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
@@ -9643,70 +2141,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
@@ -9714,70 +2150,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
@@ -9785,70 +2159,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
@@ -9866,70 +2178,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
@@ -9939,70 +2189,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
@@ -10010,70 +2198,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
@@ -10081,70 +2207,8 @@ define amdgpu_kernel void @i64_kernel_preload_arg(ptr addrspace(1) %out, i64 %a)
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: i64_kernel_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
@@ -10166,70 +2230,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX940-NO-PRELOAD-NEXT: s_endpgm
;
; GFX940-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
-; GFX940-PRELOAD-1-NEXT: s_nop 0
+; GFX940-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-1-NEXT: ; %bb.0:
; GFX940-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x8
; GFX940-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
@@ -10239,70 +2241,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX940-PRELOAD-1-NEXT: s_endpgm
;
; GFX940-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
-; GFX940-PRELOAD-2-NEXT: s_nop 0
+; GFX940-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-2-NEXT: ; %bb.0:
; GFX940-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
; GFX940-PRELOAD-2-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
@@ -10310,70 +2250,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX940-PRELOAD-2-NEXT: s_endpgm
;
; GFX940-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
-; GFX940-PRELOAD-4-NEXT: s_nop 0
+; GFX940-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-4-NEXT: ; %bb.0:
; GFX940-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
; GFX940-PRELOAD-4-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
@@ -10381,70 +2259,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX940-PRELOAD-4-NEXT: s_endpgm
;
; GFX940-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX940-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
-; GFX940-PRELOAD-8-NEXT: s_nop 0
+; GFX940-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX940-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX940-PRELOAD-8-NEXT: ; %bb.0:
; GFX940-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
; GFX940-PRELOAD-8-NEXT: v_mov_b64_e32 v[0:1], s[4:5]
@@ -10462,70 +2278,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX90a-NO-PRELOAD-NEXT: s_endpgm
;
; GFX90a-PRELOAD-1-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-1: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
-; GFX90a-PRELOAD-1-NEXT: s_nop 0
+; GFX90a-PRELOAD-1: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-1-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-1-NEXT: ; %bb.0:
; GFX90a-PRELOAD-1-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8
; GFX90a-PRELOAD-1-NEXT: v_mov_b32_e32 v2, 0
@@ -10535,70 +2289,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX90a-PRELOAD-1-NEXT: s_endpgm
;
; GFX90a-PRELOAD-2-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-2: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
-; GFX90a-PRELOAD-2-NEXT: s_nop 0
+; GFX90a-PRELOAD-2: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-2-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-2-NEXT: ; %bb.0:
; GFX90a-PRELOAD-2-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-PRELOAD-2-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
@@ -10606,70 +2298,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX90a-PRELOAD-2-NEXT: s_endpgm
;
; GFX90a-PRELOAD-4-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-4: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
-; GFX90a-PRELOAD-4-NEXT: s_nop 0
+; GFX90a-PRELOAD-4: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-4-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-4-NEXT: ; %bb.0:
; GFX90a-PRELOAD-4-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-PRELOAD-4-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
@@ -10677,70 +2307,8 @@ define amdgpu_kernel void @f64_kernel_preload_arg(ptr addrspace(1) %out, double
; GFX90a-PRELOAD-4-NEXT: s_endpgm
;
; GFX90a-PRELOAD-8-LABEL: f64_kernel_preload_arg:
-; GFX90a-PRELOAD-8: s_trap 2 ; Trap with incompatible firmware that doesn't support preloading kernel arguments.
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
-; GFX90a-PRELOAD-8-NEXT: s_nop 0
+; GFX90a-PRELOAD-8: s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
+; GFX90a-PRELOAD-8-NEXT: .fill 63, 4, 0xbf800000 ; s_nop 0
; GFX90a-PRELOAD-8-NEXT: ; %bb.0:
; GFX90a-PRELOAD-8-NEXT: v_mov_b32_e32 v2, 0
; GFX90a-PRELOAD-8-NEXT: v_pk_mov_b32 v[0:1], s[8:9], s[8:9] op_sel:[0,1]
diff --git a/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
new file mode 100644
index 0000000..ab03177
--- /dev/null
+++ b/llvm/test/CodeGen/AMDGPU/promote-alloca-scoring.ll
@@ -0,0 +1,69 @@
+; RUN: opt -S -mtriple=amdgcn-unknown-amdhsa -mcpu=kaveri -debug-only=amdgpu-promote-alloca -amdgpu-promote-alloca-to-vector-limit=512 -passes=amdgpu-promote-alloca %s -o - 2>&1 | FileCheck %s
+; REQUIRES: asserts
+
+; CHECK: Scoring: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %simpleuser, align 4
+; CHECK-NEXT: => Final Score:1
+; CHECK-NEXT: Scoring: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+1]: store i32 %v0.ext, ptr addrspace(5) %manyusers.1, align 4
+; CHECK-NEXT: [+1]: %v0 = load i8, ptr addrspace(5) %manyusers.1, align 1
+; CHECK-NEXT: [+1]: store i32 %v1.ext, ptr addrspace(5) %manyusers.2, align 4
+; CHECK-NEXT: [+1]: %v1 = load i8, ptr addrspace(5) %manyusers.2, align 1
+; CHECK-NEXT: => Final Score:4
+; CHECK-NEXT: Sorted Worklist:
+; CHECK-NEXT: %manyusers = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+define amdgpu_kernel void @simple_users_scores() #0 {
+entry:
+ ; one user (a single store) => should get a score of 1
+ %simpleuser = alloca [4 x i64], align 4, addrspace(5)
+ ; four users (two loads and two stores through GEPs) => should get a score of 4
+ %manyusers = alloca [4 x i64], align 4, addrspace(5)
+
+ store i32 42, ptr addrspace(5) %simpleuser
+
+ %manyusers.1 = getelementptr i8, ptr addrspace(5) %manyusers, i64 2
+ %v0 = load i8, ptr addrspace(5) %manyusers.1
+ %v0.ext = zext i8 %v0 to i32
+ store i32 %v0.ext, ptr addrspace(5) %manyusers.1
+
+ %manyusers.2 = getelementptr i8, ptr addrspace(5) %manyusers, i64 1
+ %v1 = load i8, ptr addrspace(5) %manyusers.2
+ %v1.ext = zext i8 %v1 to i32
+ store i32 %v1.ext, ptr addrspace(5) %manyusers.2
+
+ ret void
+}
+
+; CHECK: Scoring: %stack = alloca [4 x i64], align 4, addrspace(5)
+; CHECK-NEXT: [+5]: store i32 32, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: [+1]: store i32 42, ptr addrspace(5) %stack, align 4
+; CHECK-NEXT: [+9]: store i32 32, ptr addrspace(5) %stack.1, align 4
+; CHECK-NEXT: [+5]: %outer.cmp = load i1, ptr addrspace(5) %stack.1, align 1
+; CHECK-NEXT: [+1]: store i32 64, ptr addrspace(5) %stack.2, align 4
+; CHECK-NEXT: [+9]: %inner.cmp = load i1, ptr addrspace(5) %stack.2, align 1
+; CHECK-NEXT: => Final Score:30
+define amdgpu_kernel void @loop_users_alloca(i1 %x, i2) #0 {
+entry:
+ ; should get a score of 30: users in loop.outer score +5 each, users in loop.inner +9 each, users in entry/exit +1 each
+ %stack = alloca [4 x i64], align 4, addrspace(5)
+ %stack.1 = getelementptr i8, ptr addrspace(5) %stack, i64 4
+ %stack.2 = getelementptr i8, ptr addrspace(5) %stack, i64 8
+
+ store i32 42, ptr addrspace(5) %stack
+ br label %loop.outer
+
+loop.outer:
+ store i32 32, ptr addrspace(5) %stack
+ %outer.cmp = load i1, ptr addrspace(5) %stack.1
+ br label %loop.inner
+
+loop.inner:
+ store i32 32, ptr addrspace(5) %stack.1
+ %inner.cmp = load i1, ptr addrspace(5) %stack.2
+ br i1 %inner.cmp, label %loop.inner, label %loop.outer
+
+exit:
+ store i32 64, ptr addrspace(5) %stack.2
+ ret void
+}
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
index d92ba77..d070dc3 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-flat-work-group-size.ll
@@ -203,13 +203,13 @@ attributes #5 = { "amdgpu-flat-work-group-size"="128,512" }
attributes #6 = { "amdgpu-flat-work-group-size"="512,512" }
attributes #7 = { "amdgpu-flat-work-group-size"="64,256" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="64,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="128,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="64,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="128,128" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="512,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="64,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="128,256" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
index 2df219b..f62f1d5 100644
--- a/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
+++ b/llvm/test/CodeGen/AMDGPU/propagate-waves-per-eu.ll
@@ -399,26 +399,26 @@ attributes #17 = { "amdgpu-waves-per-eu"="5,8" }
attributes #18 = { "amdgpu-waves-per-eu"="9,10" }
attributes #19 = { "amdgpu-waves-per-eu"="8,9" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,4" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,1" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR5]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,2" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR6]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR7]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR8]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR9]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR10]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR11]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="0,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR12]] = { "amdgpu-flat-work-group-size"="1,64" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="1,123" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR13]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="2,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR14]] = { "amdgpu-flat-work-group-size"="1,512" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="3,6" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR15]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR16]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="6,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR17]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="5,5" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR18]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,8" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR19]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR20]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="9,9" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR21]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="8,9" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
index 2ccc241..fdfc9b0 100644
--- a/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
+++ b/llvm/test/CodeGen/AMDGPU/ran-out-of-sgprs-allocation-failure.mir
@@ -24,6 +24,7 @@ registers:
- { id: 10, class: sreg_64_xexec, preferred-register: '$vcc' }
frameInfo:
maxAlignment: 1
+ adjustsStack: true
hasCalls: true
machineFunctionInfo:
maxKernArgAlign: 1
diff --git a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
index eaef63b..c1d647c 100644
--- a/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
+++ b/llvm/test/CodeGen/AMDGPU/recursive_global_initializer.ll
@@ -19,5 +19,5 @@ define void @hoge() {
ret void
}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
index 297a056..384a9c4 100644
--- a/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/remove-no-kernel-id-attribute.ll
@@ -191,11 +191,11 @@ define amdgpu_kernel void @kernel_lds_recursion() {
!1 = !{i32 1, !"amdhsa_code_object_version", i32 400}
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "amdgpu-lds-size"="4" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR4]] = { "amdgpu-lds-size"="2" "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR5:[0-9]+]] = { nocallback nofree nosync nounwind willreturn memory(none) }
; CHECK: attributes #[[ATTR6:[0-9]+]] = { nocallback nofree nosync nounwind speculatable willreturn memory(none) }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
index c0d1999..0903770 100644
--- a/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
+++ b/llvm/test/CodeGen/AMDGPU/sched-crash-dbg-value.mir
@@ -181,6 +181,8 @@ legalized: false
regBankSelected: false
selected: false
tracksRegLiveness: true
+frameInfo:
+ adjustsStack: true
liveins:
- { reg: '$vgpr0', virtual-reg: '%0' }
- { reg: '$vgpr1', virtual-reg: '%1' }
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
index efbdbca..c6ccbd9 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-wrong-stack-id.mir
@@ -78,6 +78,7 @@
name: sgpr_spill_wrong_stack_id
tracksRegLiveness: true
frameInfo:
+ adjustsStack: true
hasCalls: true
machineFunctionInfo:
scratchRSrcReg: $sgpr0_sgpr1_sgpr2_sgpr3
diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
index 764f494..f523b4a 100644
--- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
+++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll
@@ -16,7 +16,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s24, s33
+; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill
@@ -150,7 +150,6 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_mov_b64 s[0:1], s[20:21]
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
-; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v255, 1
@@ -270,7 +269,7 @@ define void @spill_sgpr_with_no_lower_vgpr_available() #0 {
; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT: s_mov_b32 s33, s24
+; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, align 4, addrspace(5)
@@ -311,7 +310,7 @@ define void @spill_to_lowest_available_vgpr() #0 {
; GCN-LABEL: spill_to_lowest_available_vgpr:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s24, s33
+; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1
; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
@@ -444,7 +443,6 @@ define void @spill_to_lowest_available_vgpr() #0 {
; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_mov_b64 s[0:1], s[20:21]
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
-; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19
; GCN-NEXT: s_waitcnt lgkmcnt(0)
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: v_readlane_b32 s31, v254, 1
@@ -563,7 +561,7 @@ define void @spill_to_lowest_available_vgpr() #0 {
; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
; GCN-NEXT: s_mov_b64 exec, s[4:5]
; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT: s_mov_b32 s33, s24
+; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca i32, align 4, addrspace(5)
@@ -1530,7 +1528,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra:
; GCN: ; %bb.0:
; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GCN-NEXT: s_mov_b32 s24, s33
+; GCN-NEXT: s_mov_b32 s18, s33
; GCN-NEXT: s_mov_b32 s33, s32
; GCN-NEXT: s_add_i32 s32, s32, 0x7400
; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill
@@ -1668,7 +1666,6 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
; GCN-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-NEXT: s_mov_b64 s[0:1], s[20:21]
; GCN-NEXT: s_mov_b64 s[2:3], s[22:23]
-; GCN-NEXT: ; implicit-def: $sgpr18_sgpr19
; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-NEXT: s_mov_b64 s[4:5], exec
; GCN-NEXT: s_mov_b64 exec, 1
@@ -1801,7 +1798,7 @@ define void @spill_sgpr_no_free_vgpr_ipra() #0 {
; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload
; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload
; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00
-; GCN-NEXT: s_mov_b32 s33, s24
+; GCN-NEXT: s_mov_b32 s33, s18
; GCN-NEXT: s_waitcnt vmcnt(0)
; GCN-NEXT: s_setpc_b64 s[30:31]
call void @child_function_ipra()
diff --git a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
index f229f33..539cfc7 100644
--- a/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
+++ b/llvm/test/CodeGen/AMDGPU/simple-indirect-call.ll
@@ -73,7 +73,7 @@ define amdgpu_kernel void @test_simple_indirect_call() {
;.
; AKF_GCN: attributes #[[ATTR0]] = { "amdgpu-calls" "amdgpu-stack-objects" }
;.
-; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; ATTRIBUTOR_GCN: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; ATTRIBUTOR_GCN: attributes #[[ATTR1]] = { "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
index 3558298..f8ec6bb 100644
--- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
+++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir
@@ -21,6 +21,7 @@
name: kernel
tracksRegLiveness: true
frameInfo:
+ adjustsStack: true
hasCalls: true
machineFunctionInfo:
isEntryFunction: true
diff --git a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
index b8bc01e..c6a5990 100644
--- a/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
+++ b/llvm/test/CodeGen/AMDGPU/stacksave_stackrestore.ll
@@ -916,13 +916,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-O0-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects:
; WAVE32-O0: ; %bb.0:
; WAVE32-O0-NEXT: s_mov_b32 s32, 0x1200
-; WAVE32-O0-NEXT: s_getpc_b64 s[24:25]
-; WAVE32-O0-NEXT: s_mov_b32 s24, s0
-; WAVE32-O0-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0
+; WAVE32-O0-NEXT: s_getpc_b64 s[20:21]
+; WAVE32-O0-NEXT: s_mov_b32 s20, s0
+; WAVE32-O0-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0
; WAVE32-O0-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE32-O0-NEXT: s_bitset0_b32 s27, 21
-; WAVE32-O0-NEXT: s_add_u32 s24, s24, s9
-; WAVE32-O0-NEXT: s_addc_u32 s25, s25, 0
+; WAVE32-O0-NEXT: s_bitset0_b32 s23, 21
+; WAVE32-O0-NEXT: s_add_u32 s20, s20, s9
+; WAVE32-O0-NEXT: s_addc_u32 s21, s21, 0
; WAVE32-O0-NEXT: ; implicit-def: $vgpr3 : SGPR spill to VGPR lane
; WAVE32-O0-NEXT: s_mov_b32 s14, s8
; WAVE32-O0-NEXT: s_mov_b32 s13, s7
@@ -934,17 +934,17 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 0
; WAVE32-O0-NEXT: s_lshr_b32 s0, s0, 5
; WAVE32-O0-NEXT: v_writelane_b32 v3, s0, 1
-; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1
-; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0 offset:128 ; 4-byte Folded Spill
-; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20
+; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1
+; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0 offset:128 ; 4-byte Folded Spill
+; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19
; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 42
-; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], 0
+; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], 0
; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0
-; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[24:25]
-; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[26:27]
+; WAVE32-O0-NEXT: s_mov_b64 s[0:1], s[20:21]
+; WAVE32-O0-NEXT: s_mov_b64 s[2:3], s[22:23]
; WAVE32-O0-NEXT: s_mov_b32 s6, s32
; WAVE32-O0-NEXT: v_mov_b32_e32 v3, 17
-; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4
+; WAVE32-O0-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4
; WAVE32-O0-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi
; WAVE32-O0-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
; WAVE32-O0-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
@@ -1018,11 +1018,10 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18
; WAVE32-O0-NEXT: ; implicit-def: $sgpr18
; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18
-; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-O0-NEXT: s_or_saveexec_b32 s20, -1
-; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload
-; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s20
+; WAVE32-O0-NEXT: s_or_saveexec_b32 s19, -1
+; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[20:23], 0 offset:128 ; 4-byte Folded Reload
+; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s19
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE32-O0-NEXT: v_readlane_b32 s1, v0, 1
; WAVE32-O0-NEXT: v_readlane_b32 s0, v0, 0
@@ -1137,7 +1136,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18
; WAVE64-O0-NEXT: ; implicit-def: $sgpr18
; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18
-; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE64-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[24:27], 0 offset:128 ; 4-byte Folded Reload
@@ -1155,13 +1153,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-WWM-PREALLOC-LABEL: kernel_stacksave_stackrestore_call_with_stack_objects:
; WAVE32-WWM-PREALLOC: ; %bb.0:
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s32, 0x1200
-; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[24:25]
-; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s0
-; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[24:27], s[24:25], 0x0
+; WAVE32-WWM-PREALLOC-NEXT: s_getpc_b64 s[20:21]
+; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s20, s0
+; WAVE32-WWM-PREALLOC-NEXT: s_load_dwordx4 s[20:23], s[20:21], 0x0
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt lgkmcnt(0)
-; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s27, 21
-; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s24, s24, s9
-; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s25, s25, 0
+; WAVE32-WWM-PREALLOC-NEXT: s_bitset0_b32 s23, 21
+; WAVE32-WWM-PREALLOC-NEXT: s_add_u32 s20, s20, s9
+; WAVE32-WWM-PREALLOC-NEXT: s_addc_u32 s21, s21, 0
; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $vgpr32 : SGPR spill to VGPR lane
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s14, s8
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s13, s7
@@ -1174,13 +1172,13 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-WWM-PREALLOC-NEXT: s_lshr_b32 s0, s0, 5
; WAVE32-WWM-PREALLOC-NEXT: v_writelane_b32 v32, s0, 1
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 42
-; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], 0
+; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], 0
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt_vscnt null, 0x0
-; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[24:25]
-; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[26:27]
+; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[0:1], s[20:21]
+; WAVE32-WWM-PREALLOC-NEXT: s_mov_b64 s[2:3], s[22:23]
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, s32
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v3, 17
-; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[24:27], s6 offset:4
+; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v3, off, s[20:23], s6 offset:4
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s6, stack_passed_argument@abs32@hi
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s16, stack_passed_argument@abs32@lo
; WAVE32-WWM-PREALLOC-NEXT: ; kill: def $sgpr16 killed $sgpr16 def $sgpr16_sgpr17
@@ -1254,7 +1252,6 @@ define amdgpu_kernel void @kernel_stacksave_stackrestore_call_with_stack_objects
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18
; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18
-; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19
; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s1, v32, 1
; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s0, v32, 0
@@ -1347,7 +1344,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
; WAVE32-O0: ; %bb.0:
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WAVE32-O0-NEXT: s_mov_b32 s26, s33
+; WAVE32-O0-NEXT: s_mov_b32 s25, s33
; WAVE32-O0-NEXT: s_mov_b32 s33, s32
; WAVE32-O0-NEXT: s_xor_saveexec_b32 s16, -1
; WAVE32-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
@@ -1361,9 +1358,9 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 0
; WAVE32-O0-NEXT: s_lshr_b32 s16, s16, 5
; WAVE32-O0-NEXT: v_writelane_b32 v0, s16, 1
-; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1
+; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1
; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill
-; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25
+; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24
; WAVE32-O0-NEXT: v_mov_b32_e32 v0, 42
; WAVE32-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33
; WAVE32-O0-NEXT: s_waitcnt_vscnt null, 0x0
@@ -1440,11 +1437,10 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-O0-NEXT: v_mov_b32_e32 v29, s18
; WAVE32-O0-NEXT: ; implicit-def: $sgpr18
; WAVE32-O0-NEXT: v_mov_b32_e32 v30, s18
-; WAVE32-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; WAVE32-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
-; WAVE32-O0-NEXT: s_or_saveexec_b32 s25, -1
+; WAVE32-O0-NEXT: s_or_saveexec_b32 s24, -1
; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
-; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s25
+; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s24
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE32-O0-NEXT: v_readlane_b32 s5, v0, 1
; WAVE32-O0-NEXT: v_readlane_b32 s4, v0, 0
@@ -1460,14 +1456,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
; WAVE32-O0-NEXT: s_mov_b32 exec_lo, s4
; WAVE32-O0-NEXT: s_add_i32 s32, s32, 0xffffee00
-; WAVE32-O0-NEXT: s_mov_b32 s33, s26
+; WAVE32-O0-NEXT: s_mov_b32 s33, s25
; WAVE32-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE32-O0-NEXT: s_setpc_b64 s[30:31]
;
; WAVE64-O0-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
; WAVE64-O0: ; %bb.0:
; WAVE64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WAVE64-O0-NEXT: s_mov_b32 s28, s33
+; WAVE64-O0-NEXT: s_mov_b32 s19, s33
; WAVE64-O0-NEXT: s_mov_b32 s33, s32
; WAVE64-O0-NEXT: s_xor_saveexec_b64 s[16:17], -1
; WAVE64-O0-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
@@ -1560,7 +1556,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE64-O0-NEXT: v_mov_b32_e32 v29, s18
; WAVE64-O0-NEXT: ; implicit-def: $sgpr18
; WAVE64-O0-NEXT: v_mov_b32_e32 v30, s18
-; WAVE64-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; WAVE64-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE64-O0-NEXT: s_or_saveexec_b64 s[26:27], -1
; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
@@ -1580,14 +1575,14 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload
; WAVE64-O0-NEXT: s_mov_b64 exec, s[4:5]
; WAVE64-O0-NEXT: s_add_i32 s32, s32, 0xffffdc00
-; WAVE64-O0-NEXT: s_mov_b32 s33, s28
+; WAVE64-O0-NEXT: s_mov_b32 s33, s19
; WAVE64-O0-NEXT: s_waitcnt vmcnt(0)
; WAVE64-O0-NEXT: s_setpc_b64 s[30:31]
;
; WAVE32-WWM-PREALLOC-LABEL: func_stacksave_stackrestore_call_with_stack_objects:
; WAVE32-WWM-PREALLOC: ; %bb.0:
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s25, s33
+; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s24, s33
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s32
; WAVE32-WWM-PREALLOC-NEXT: s_xor_saveexec_b32 s16, -1
; WAVE32-WWM-PREALLOC-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill
@@ -1677,7 +1672,6 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v29, s18
; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18
; WAVE32-WWM-PREALLOC-NEXT: v_mov_b32_e32 v30, s18
-; WAVE32-WWM-PREALLOC-NEXT: ; implicit-def: $sgpr18_sgpr19
; WAVE32-WWM-PREALLOC-NEXT: s_swappc_b64 s[30:31], s[16:17]
; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s5, v32, 1
; WAVE32-WWM-PREALLOC-NEXT: v_readlane_b32 s4, v32, 0
@@ -1693,7 +1687,7 @@ define void @func_stacksave_stackrestore_call_with_stack_objects() {
; WAVE32-WWM-PREALLOC-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload
; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 exec_lo, s4
; WAVE32-WWM-PREALLOC-NEXT: s_add_i32 s32, s32, 0xffffee00
-; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s25
+; WAVE32-WWM-PREALLOC-NEXT: s_mov_b32 s33, s24
; WAVE32-WWM-PREALLOC-NEXT: s_waitcnt vmcnt(0)
; WAVE32-WWM-PREALLOC-NEXT: s_setpc_b64 s[30:31]
%alloca = alloca [32 x i32], addrspace(5)
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
index 8d5dc79..049db01 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-attribute-missing.ll
@@ -31,6 +31,6 @@ define amdgpu_kernel void @kernel1() #1 {
attributes #0 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
index 7a6f82d..c9387f1 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-multistep.ll
@@ -98,7 +98,7 @@ define amdgpu_kernel void @kernel2() #0 {
attributes #0 = { "uniform-work-group-size"="true" }
;.
; CHECK: attributes #[[ATTR0]] = { "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR2]] = { "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR3]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
index c04154c..7183da2 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-nested-function-calls.ll
@@ -41,6 +41,6 @@ define amdgpu_kernel void @kernel3() #2 {
attributes #2 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
index 2d5ff04..6ed04cf 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-prevent-attribute-propagation.ll
@@ -41,7 +41,7 @@ define amdgpu_kernel void @kernel2() #2 {
attributes #1 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
index e8bf6fc..d5ba2fd 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-propagate-attribute.ll
@@ -52,8 +52,8 @@ attributes #0 = { nounwind }
attributes #1 = { "uniform-work-group-size"="false" }
attributes #2 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { nounwind "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR2]] = { nounwind "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
; CHECK: attributes #[[ATTR3]] = { "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
index 473eea4..7f0dfea 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-recursion-test.ll
@@ -101,7 +101,7 @@ define amdgpu_kernel void @kernel(ptr addrspace(1) %m) #1 {
attributes #0 = { nounwind readnone }
attributes #1 = { "uniform-work-group-size"="true" }
;.
-; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
-; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR0]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { nounwind memory(none) "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
+; CHECK: attributes #[[ATTR2]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="true" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
index 221f1a1..8616c73 100644
--- a/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
+++ b/llvm/test/CodeGen/AMDGPU/uniform-work-group-test.ll
@@ -61,6 +61,6 @@ define amdgpu_kernel void @kernel3() #0 {
attributes #0 = { "uniform-work-group-size"="false" }
;.
-; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
-; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR0]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "amdgpu-waves-per-eu"="4,10" "uniform-work-group-size"="false" }
+; CHECK: attributes #[[ATTR1]] = { "amdgpu-no-agpr" "amdgpu-no-completion-action" "amdgpu-no-default-queue" "amdgpu-no-dispatch-id" "amdgpu-no-dispatch-ptr" "amdgpu-no-heap-ptr" "amdgpu-no-hostcall-ptr" "amdgpu-no-implicitarg-ptr" "amdgpu-no-lds-kernel-id" "amdgpu-no-multigrid-sync-arg" "amdgpu-no-queue-ptr" "amdgpu-no-workgroup-id-x" "amdgpu-no-workgroup-id-y" "amdgpu-no-workgroup-id-z" "amdgpu-no-workitem-id-x" "amdgpu-no-workitem-id-y" "amdgpu-no-workitem-id-z" "uniform-work-group-size"="false" }
;.
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
index 717d3d9..0407994 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-agpr-limit-gfx90a.ll
@@ -540,6 +540,7 @@ define internal void @use512vgprs() {
}
define void @foo() #0 {
+ call void asm sideeffect "; use $0", "a"(i32 0)
ret void
}
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
index d2364a6..bfc249e 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr-liverange-ir.ll
@@ -233,10 +233,10 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
; SI-NEXT: bb.1.Flow:
; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %49:vgpr_32, %bb.0, %4, %bb.9
- ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %51:vgpr_32, %bb.9
- ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %53:vgpr_32, %bb.9
- ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %55:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %47:vgpr_32, %bb.0, %4, %bb.9
+ ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY4]], %bb.0, undef %49:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %51:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI3:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %53:vgpr_32, %bb.9
; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI-NEXT: S_BRANCH %bb.2
; SI-NEXT: {{ $}}
@@ -249,8 +249,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
; SI-NEXT: bb.3:
; SI-NEXT: successors: %bb.4(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %57:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
- ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %59:vgpr_32, %bb.4, [[PHI1]], %bb.2
+ ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %55:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
+ ; SI-NEXT: [[PHI5:%[0-9]+]]:vgpr_32 = PHI undef %57:vgpr_32, %bb.4, [[PHI1]], %bb.2
; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1
@@ -286,8 +286,8 @@ define amdgpu_ps float @loop(i32 %z, float %v, i32 inreg %bound, ptr %extern_fun
; SI-NEXT: bb.7:
; SI-NEXT: successors: %bb.8(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %61:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
- ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %63:vgpr_32, %bb.8, [[COPY4]], %bb.6
+ ; SI-NEXT: [[PHI6:%[0-9]+]]:vreg_64 = PHI undef %59:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
+ ; SI-NEXT: [[PHI7:%[0-9]+]]:vgpr_32 = PHI undef %61:vgpr_32, %bb.8, [[COPY4]], %bb.6
; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI6]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1
@@ -356,9 +356,9 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: bb.1.Flow:
; SI-NEXT: successors: %bb.2(0x40000000), %bb.10(0x40000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %50:vgpr_32, %bb.0, %4, %bb.9
- ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %52:vgpr_32, %bb.9
- ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %54:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI:%[0-9]+]]:vgpr_32 = PHI undef %48:vgpr_32, %bb.0, %4, %bb.9
+ ; SI-NEXT: [[PHI1:%[0-9]+]]:vgpr_32 = PHI [[COPY3]], %bb.0, undef %50:vgpr_32, %bb.9
+ ; SI-NEXT: [[PHI2:%[0-9]+]]:vgpr_32 = PHI [[COPY2]], %bb.0, undef %52:vgpr_32, %bb.9
; SI-NEXT: [[SI_ELSE:%[0-9]+]]:sreg_32 = SI_ELSE killed [[SI_IF]], %bb.10, implicit-def dead $exec, implicit-def dead $scc, implicit $exec
; SI-NEXT: S_BRANCH %bb.2
; SI-NEXT: {{ $}}
@@ -371,7 +371,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: bb.3:
; SI-NEXT: successors: %bb.4(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
+ ; SI-NEXT: [[PHI3:%[0-9]+]]:vreg_64 = PHI undef %54:vreg_64, %bb.4, [[REG_SEQUENCE]], %bb.2
; SI-NEXT: [[V_READFIRSTLANE_B32_:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_1:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI3]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_1]], %subreg.sub1
@@ -407,7 +407,7 @@ define amdgpu_ps float @loop_with_use(i32 %z, float %v, i32 inreg %bound, ptr %e
; SI-NEXT: bb.7:
; SI-NEXT: successors: %bb.8(0x80000000)
; SI-NEXT: {{ $}}
- ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %58:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
+ ; SI-NEXT: [[PHI4:%[0-9]+]]:vreg_64 = PHI undef %56:vreg_64, %bb.8, [[REG_SEQUENCE2]], %bb.6
; SI-NEXT: [[V_READFIRSTLANE_B32_2:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub0, implicit $exec
; SI-NEXT: [[V_READFIRSTLANE_B32_3:%[0-9]+]]:sgpr_32 = V_READFIRSTLANE_B32 [[PHI4]].sub1, implicit $exec
; SI-NEXT: [[REG_SEQUENCE3:%[0-9]+]]:sgpr_64 = REG_SEQUENCE killed [[V_READFIRSTLANE_B32_2]], %subreg.sub0, killed [[V_READFIRSTLANE_B32_3]], %subreg.sub1
diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
index 37f207f..4939d526 100644
--- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
+++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll
@@ -47,7 +47,6 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id
; CHECK-NEXT: s_mov_b32 s15, 42
; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21]
; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23]
-; CHECK-NEXT: ; implicit-def: $sgpr18_sgpr19
; CHECK-NEXT: s_waitcnt lgkmcnt(0)
; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17]
; CHECK-NEXT: s_endpgm
diff --git a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
index 3d9db68..6659e95 100644
--- a/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
+++ b/llvm/test/CodeGen/AMDGPU/virtregrewrite-undef-identity-copy.mir
@@ -20,6 +20,7 @@ name: undef_identity_copy
tracksRegLiveness: true
frameInfo:
maxAlignment: 4
+ adjustsStack: true
hasCalls: true
machineFunctionInfo:
isEntryFunction: true
diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll
index 82816b4..901e88a 100644
--- a/llvm/test/CodeGen/AMDGPU/wave32.ll
+++ b/llvm/test/CodeGen/AMDGPU/wave32.ll
@@ -2479,8 +2479,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
; GFX1032-NEXT: v_mul_f32_e32 v1, 0x4f7ffffe, v1
; GFX1032-NEXT: v_cvt_u32_f32_e32 v1, v1
; GFX1032-NEXT: v_mul_lo_u32 v2, s1, v1
-; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000
-; GFX1032-NEXT: s_add_i32 s1, s1, 32
+; GFX1032-NEXT: s_brev_b32 s1, 1
; GFX1032-NEXT: v_mul_hi_u32 v2, v1, v2
; GFX1032-NEXT: v_add_nc_u32_e32 v1, v1, v2
; GFX1032-NEXT: v_mul_hi_u32 v1, v0, v1
@@ -2494,8 +2493,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
; GFX1032-NEXT: v_cndmask_b32_e32 v0, v0, v1, vcc_lo
; GFX1032-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
-; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
-; GFX1032-NEXT: s_min_u32 s0, s0, s1
+; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1]
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -2529,10 +2527,7 @@ define amdgpu_kernel void @icmp64(i32 %n, i32 %s) {
; GFX1064-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0
; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1
; GFX1064-NEXT: s_bitset1_b32 s1, 31
-; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
-; GFX1064-NEXT: s_ff1_i32_b32 s1, s1
-; GFX1064-NEXT: s_add_i32 s1, s1, 32
-; GFX1064-NEXT: s_min_u32 s0, s0, s1
+; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1]
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
@@ -2576,9 +2571,8 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1032-NEXT: s_waitcnt lgkmcnt(0)
; GFX1032-NEXT: v_div_scale_f32 v1, s1, s0, s0, v0
; GFX1032-NEXT: v_div_scale_f32 v4, vcc_lo, v0, s0, v0
-; GFX1032-NEXT: s_ff1_i32_b32 s1, 0x80000000
+; GFX1032-NEXT: s_brev_b32 s1, 1
; GFX1032-NEXT: v_rcp_f32_e32 v2, v1
-; GFX1032-NEXT: s_add_i32 s1, s1, 32
; GFX1032-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX1032-NEXT: v_fmac_f32_e32 v2, v3, v2
; GFX1032-NEXT: v_mul_f32_e32 v3, v4, v2
@@ -2592,8 +2586,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1032-NEXT: v_cmp_eq_f32_e32 vcc_lo, 0, v0
; GFX1032-NEXT: s_lshr_b32 s0, vcc_lo, 1
; GFX1032-NEXT: v_cmp_nlg_f32_e32 vcc_lo, 0, v0
-; GFX1032-NEXT: s_ff1_i32_b32 s0, s0
-; GFX1032-NEXT: s_min_u32 s0, s0, s1
+; GFX1032-NEXT: s_ff1_i32_b64 s0, s[0:1]
; GFX1032-NEXT: s_cmp_gt_u32 s0, 9
; GFX1032-NEXT: s_cselect_b32 s0, -1, 0
; GFX1032-NEXT: s_and_b32 s0, vcc_lo, s0
@@ -2609,15 +2602,15 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1064-NEXT: v_cvt_f32_u32_e32 v0, v0
; GFX1064-NEXT: s_waitcnt lgkmcnt(0)
; GFX1064-NEXT: v_div_scale_f32 v1, s[0:1], s2, s2, v0
-; GFX1064-NEXT: v_div_scale_f32 v4, vcc, v0, s2, v0
; GFX1064-NEXT: v_rcp_f32_e32 v2, v1
; GFX1064-NEXT: v_fma_f32 v3, -v1, v2, 1.0
; GFX1064-NEXT: v_fmac_f32_e32 v2, v3, v2
-; GFX1064-NEXT: v_mul_f32_e32 v3, v4, v2
-; GFX1064-NEXT: v_fma_f32 v5, -v1, v3, v4
-; GFX1064-NEXT: v_fmac_f32_e32 v3, v5, v2
-; GFX1064-NEXT: v_fma_f32 v1, -v1, v3, v4
-; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v3
+; GFX1064-NEXT: v_div_scale_f32 v3, vcc, v0, s2, v0
+; GFX1064-NEXT: v_mul_f32_e32 v4, v3, v2
+; GFX1064-NEXT: v_fma_f32 v5, -v1, v4, v3
+; GFX1064-NEXT: v_fmac_f32_e32 v4, v5, v2
+; GFX1064-NEXT: v_fma_f32 v1, -v1, v4, v3
+; GFX1064-NEXT: v_div_fmas_f32 v1, v1, v2, v4
; GFX1064-NEXT: v_div_fixup_f32 v1, v1, s2, v0
; GFX1064-NEXT: v_trunc_f32_e32 v1, v1
; GFX1064-NEXT: v_fma_f32 v0, -v1, s2, v0
@@ -2625,10 +2618,7 @@ define amdgpu_kernel void @fcmp64(float %n, float %s) {
; GFX1064-NEXT: s_lshr_b64 s[0:1], vcc, 1
; GFX1064-NEXT: v_cmp_nlg_f32_e32 vcc, 0, v0
; GFX1064-NEXT: s_bitset1_b32 s1, 31
-; GFX1064-NEXT: s_ff1_i32_b32 s0, s0
-; GFX1064-NEXT: s_ff1_i32_b32 s1, s1
-; GFX1064-NEXT: s_add_i32 s1, s1, 32
-; GFX1064-NEXT: s_min_u32 s0, s0, s1
+; GFX1064-NEXT: s_ff1_i32_b64 s0, s[0:1]
; GFX1064-NEXT: s_cmp_gt_u32 s0, 9
; GFX1064-NEXT: s_cselect_b64 s[0:1], -1, 0
; GFX1064-NEXT: s_and_b64 s[0:1], vcc, s[0:1]
diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
index 3a33194..7eabe98 100644
--- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll
@@ -101,7 +101,6 @@ define void @test() #0 {
; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1]
; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21]
; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23]
-; GCN-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; GCN-O0-NEXT: s_waitcnt lgkmcnt(0)
; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
index 11f6a29..e79cb66 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll
@@ -406,7 +406,6 @@ define amdgpu_gfx void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 inreg
; GFX9-O0-NEXT: s_mov_b64 s[0:1], s[44:45]
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[46:47]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
-; GFX9-O0-NEXT: ; implicit-def: $sgpr44_sgpr45
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[42:43]
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v0
; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2
@@ -633,7 +632,6 @@ define amdgpu_gfx void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i64 i
; GFX9-O0-NEXT: s_mov_b64 s[2:3], s[38:39]
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3
-; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1
diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
index e5cebc1..def51f2 100644
--- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
+++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll
@@ -413,7 +413,6 @@ define amdgpu_kernel void @call(ptr addrspace(8) inreg %tmp14, i32 inreg %arg) {
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -657,7 +656,6 @@ define amdgpu_kernel void @call_i64(ptr addrspace(8) inreg %tmp14, i64 inreg %ar
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
@@ -1285,7 +1283,6 @@ define amdgpu_kernel void @strict_wwm_call(ptr addrspace(8) inreg %tmp14, i32 in
; GFX9-O0-NEXT: ; implicit-def: $sgpr15
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
-; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1
; GFX9-O0-NEXT: buffer_load_dword v1, off, s[24:27], 0 ; 4-byte Folded Reload
@@ -1529,7 +1526,6 @@ define amdgpu_kernel void @strict_wwm_call_i64(ptr addrspace(8) inreg %tmp14, i6
; GFX9-O0-NEXT: v_mov_b32_e32 v31, v3
; GFX9-O0-NEXT: v_mov_b32_e32 v0, v6
; GFX9-O0-NEXT: v_mov_b32_e32 v1, v7
-; GFX9-O0-NEXT: ; implicit-def: $sgpr18_sgpr19
; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0)
; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[16:17]
; GFX9-O0-NEXT: s_or_saveexec_b64 s[20:21], -1